├── .editorconfig ├── .eslintignore ├── .eslintrc ├── .gitignore ├── .vscode └── launch.json ├── LICENSE ├── README.md ├── package.json ├── scripts ├── restart.js ├── start.js └── stop.js ├── src ├── config │ ├── index.js │ └── privates.js ├── index.html ├── index.js ├── parse.js ├── puppeteer.js ├── scheduler.js ├── table.js └── util.js └── test ├── 163 ├── account.js ├── clickLinkInEmail.js ├── openEmail.js ├── signin.js └── test.js ├── accounts └── index.js ├── batch.js ├── github ├── account-signin.js ├── account-signup.js ├── openAndStarRepos.js ├── signin.js ├── signup.js ├── test-signin-repl.js ├── test-signin.js ├── test-signup-batch.js └── test-signup.js ├── iqiyi ├── screenshot.js ├── screenshots │ └── .gitkeep └── test-screenshot.js └── zhihu ├── elem-extract.js ├── fespider.js ├── outputs └── .gitkeep └── test-elem-extract.js /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | end_of_line = lf 5 | insert_final_newline = true 6 | charset = utf-8 7 | 8 | [src/**.{js}] 9 | indent_style = space 10 | indent_size = 4 11 | 12 | [*.json] 13 | indent_size = 2 14 | -------------------------------------------------------------------------------- /.eslintignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenfe/puppeteer-service/7b1731d1725d5376e23109f6054ac963278eec75/.eslintignore -------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "standard", 3 | "parserOptions": { 4 | "ecmaVersion": 8 5 | }, 6 | "rules": { 7 | "semi": 0, 8 | "indent": ["error", 4], 9 | "camelcase": 0, 10 | "space-before-function-paren": ["error", { 11 | "anonymous": "always", 12 | "named": "never", 13 | "asyncArrow": "always" 14 | }], 15 | "no-fallthrough": 0, 16 | "no-unused-vars": 0, 17 | "eqeqeq": 0, 18 | 
"operator-linebreak": 0, 19 | "no-labels": 0, 20 | "no-inner-declarations": 0, 21 | "no-undef": 0, 22 | "standard/no-callback-literal": 0, 23 | "key-spacing": 0, 24 | "no-new-func": 0, 25 | "no-unused-expressions": 0, 26 | "no-new": 0, 27 | "eol-last": 0, 28 | "no-global-assign": 0 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | package-lock.json 3 | .DS_Store 4 | accounts* 5 | test/iqiyi/screenshots/*.png 6 | test/zhihu/outputs/*.html 7 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "type": "node", 6 | "request": "attach", 7 | "name": "Attach by Process ID", 8 | "processId": "${command:PickProcess}", 9 | "port": 3000, 10 | "protocol": "inspector" 11 | }, 12 | { 13 | "type": "node", 14 | "request": "launch", 15 | "name": "stop", 16 | "program": "${workspaceFolder}/scripts/stop.js" 17 | }, 18 | { 19 | "type": "node", 20 | "request": "launch", 21 | "name": "test 163", 22 | "program": "${workspaceFolder}/test/163/test.js" 23 | }, 24 | { 25 | "type": "node", 26 | "request": "launch", 27 | "name": "test github signin", 28 | "program": "${workspaceFolder}/test/github/test-signin.js" 29 | }, 30 | { 31 | "type": "node", 32 | "request": "launch", 33 | "name": "test github signup", 34 | "program": "${workspaceFolder}/test/github/test-signup.js" 35 | } 36 | ] 37 | } 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Shen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated 
documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # puppeteer-service 2 | 3 | 4 | 5 | 🎠 Run [GoogleChrome/puppeteer](https://github.com/GoogleChrome/puppeteer) as a service. 6 | 7 | ## Usage 8 | 9 | ### Server 10 | 11 | ```bash 12 | $ npm install puppeteer-service --save 13 | ``` 14 | 15 | ```js 16 | const PuppeteerService = require('puppeteer-service'); 17 | const { koaApp, server } = PuppeteerService({ 18 | cluster: true, // default: false 19 | port: 3000, // default 20 | api: 'run', // default 21 | test: true, // default: false 22 | puppeteer: { 23 | // See https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md#puppeteerlaunchoptions 24 | headless: true, // default 25 | args: ['--no-sandbox'] 26 | } 27 | }); 28 | ``` 29 | 30 | 😯 If the `test` option is set `true` like above, you can visit the test page via `http://your.host:3000/test/`. 
31 | 32 | ### Client 33 | 34 | #### 👉 Option 1: Use puppeteer-service-client 35 | 36 | ```bash 37 | $ npm install puppeteer-service-client --save 38 | ``` 39 | 40 | Use [puppeteer-service-client](https://github.com/shenfe/puppeteer-service-client) to communicate with the server. It runs in **both the browser and Node.js**. 41 | 42 | ```js 43 | const Run = require('puppeteer-service-client'); 44 | Run('http://your.host:3000/run', { 45 | /* Entry page url */ 46 | url: 'https://target.com/', 47 | 48 | /* Runner function */ 49 | run: async page => { 50 | const title = await page.title(); 51 | echo({ url: page.url(), title }); 52 | return { 53 | info: b(a, title) 54 | }; 55 | }, 56 | 57 | /* Options (Optional) */ 58 | options: { 59 | /* Variables to inject */ 60 | /* Identifiers and their corresponding literal values will be injected 61 | as variable declarations into the runner function. */ 62 | injection: { 63 | a: 'Welcome to ', 64 | b: function (x, y) { 65 | return x + y; 66 | } 67 | } 68 | }, 69 | 70 | /* WebSocket data handler (Optional) */ 71 | socket: data => { 72 | /**/ 73 | } 74 | }) 75 | .then(data => { 76 | /**/ 77 | }).catch(error => { 78 | /**/ 79 | }); 80 | ``` 81 | 82 | **socket and echo** 83 | 84 | The `socket` option specifies a handler for WebSocket data on the client side. Correspondingly, `echo` is a built-in function, callable inside the "page runner function", that sends data to the matching socket connection with the client. 
85 | 86 | #### 👉 Option 2: Send a request directly 87 | 88 | For example: 89 | 90 | ```js 91 | const pageRunner = async page => { 92 | const title = await page.title(); 93 | return { 94 | info: b(a, title) 95 | }; 96 | }; 97 | fetch('http://your.host:3000/run', { 98 | method: 'POST', 99 | /*...*/ 100 | headers: { 101 | 'Content-Type': 'application/json' 102 | }, 103 | body: JSON.stringify({ 104 | data: `{ 105 | url: 'https://www.sogou.com', 106 | run: ${pageRunner}, 107 | options: { 108 | injection: { 109 | a: 'Welcome to ', 110 | b: function (x, y) { 111 | return x + y; 112 | } 113 | } 114 | } 115 | }` 116 | }) 117 | }) 118 | .then(res => { 119 | if (res.ok) return res.json(); 120 | throw new Error('Response is not ok'); 121 | }) 122 | .then(data => { 123 | /**/ 124 | }).catch(error => { 125 | /**/ 126 | }); 127 | ``` 128 | 129 | ⚠️ This approach is lightweight, but it does not support communicating with the server via WebSocket. 130 | 131 | ## Development 132 | 133 | Some commands: 134 | 135 | ```bash 136 | npm start # start 137 | npm start -- -p 3000 # port 138 | npm start -- -c # cluster 139 | npm run debug # debugging mode 140 | npm test # test 141 | npm test -- -u http://127.0.0.1:3000/run # api url 142 | npm test -- -n 10 # batch number 143 | ``` 144 | 145 | ## License 146 | 147 | [MIT](http://opensource.org/licenses/MIT) 148 | 149 | Copyright © 2018-present, [shenfe](https://github.com/shenfe) 150 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "scripts": { 3 | "start": "node ./scripts/start.js", 4 | "stop": "node ./scripts/stop.js", 5 | "restart": "node ./scripts/restart.js", 6 | "debug": "node --inspect ./scripts/start.js", 7 | "test": "node ./test/batch.js" 8 | }, 9 | "dependencies": { 10 | "@koa/cors": "^2.2.1", 11 | "clusterun": "^0.1.1", 12 | "function-sandbox": "^1.1.2", 13 | "http-graceful-shutdown": "^2.1.0", 14 | 
"ip": "^1.1.5", 15 | "koa": "^2.3.0", 16 | "koa-bodyparser": "^4.2.0", 17 | "koa-router": "^7.2.1", 18 | "koa-session": "^5.5.0", 19 | "node-fetch": "^2.0.0", 20 | "puppeteer": "^1.0.0", 21 | "puppeteer-service-client": "^0.1.6", 22 | "socket.io": "^2.0.4", 23 | "sticky-session": "^1.1.2" 24 | }, 25 | "devDependencies": { 26 | "Base64": "^1.0.1", 27 | "babel-core": "^6.26.0", 28 | "eslint": "^4.9.0", 29 | "eslint-config-standard": "^10.2.1", 30 | "eslint-plugin-import": "^2.8.0", 31 | "eslint-plugin-node": "^5.2.0", 32 | "eslint-plugin-promise": "^3.6.0", 33 | "eslint-plugin-standard": "^3.0.1", 34 | "http-server": "^0.11.1", 35 | "open": "0.0.5", 36 | "readline-sync": "^1.4.9" 37 | }, 38 | "name": "puppeteer-service", 39 | "description": "Run headless Chrome (aka Puppeteer) as a service, for web crawling, remote controlling and so on.", 40 | "version": "0.4.8", 41 | "main": "src/index.js", 42 | "directories": { 43 | "test": "test" 44 | }, 45 | "repository": { 46 | "type": "git", 47 | "url": "git+https://github.com/shenfe/puppeteer-service.git" 48 | }, 49 | "keywords": [ 50 | "puppeteer", 51 | "headless-chrome", 52 | "service", 53 | "web-crawler" 54 | ], 55 | "author": "hengwu", 56 | "license": "MIT", 57 | "bugs": { 58 | "url": "https://github.com/shenfe/puppeteer-service/issues" 59 | }, 60 | "homepage": "https://github.com/shenfe/puppeteer-service#readme" 61 | } 62 | -------------------------------------------------------------------------------- /scripts/restart.js: -------------------------------------------------------------------------------- 1 | (async () => { 2 | await require('./stop'); 3 | require('./start'); 4 | })(); 5 | -------------------------------------------------------------------------------- /scripts/start.js: -------------------------------------------------------------------------------- 1 | let port; 2 | 3 | let useCluster = false; 4 | 5 | const args = process.argv.slice(2); 6 | args.forEach(function (val, index, array) { 7 | switch (val) { 8 
| case '-p': 9 | case '--port': 10 | port = +args[index + 1]; 11 | break; 12 | case '-c': 13 | case '--cluster': 14 | useCluster = true; 15 | break; 16 | } 17 | }); 18 | 19 | const index = require('../src'); 20 | 21 | const run = (ifUseCluster) => index({ 22 | cluster: ifUseCluster, 23 | test: true, 24 | ...(port && { port }), 25 | puppeteer: { 26 | // headless: false, 27 | // executablePath: '..\\spiderman\\node_modules\\puppeteer\\.local-chromium\\win64-526987\\chrome-win32\\chrome.exe' 28 | } 29 | }).then(({ koaApp, server }) => { 30 | // do stuff 31 | }); 32 | 33 | console.log('Starting...'); 34 | 35 | if (useCluster) { 36 | // console.log('使用Node的cluster模块会导致session混乱问题。如果想要使用集群,请用不同端口启动多个puppeteer-service,再在更上层根据ip-hash实现分发。'); 37 | run(true); 38 | } else { 39 | run(); 40 | } 41 | -------------------------------------------------------------------------------- /scripts/stop.js: -------------------------------------------------------------------------------- 1 | const fetch = require('node-fetch'); 2 | const config = require('../src/config'); 3 | const keyString = require('../src/config/privates').key; 4 | 5 | let port; 6 | 7 | const args = process.argv.slice(2); 8 | args.forEach(function (val, index, array) { 9 | switch (val) { 10 | case '-p': 11 | case '--port': 12 | port = +args[index + 1]; 13 | break; 14 | } 15 | }); 16 | 17 | port = port || config.server.port; 18 | 19 | module.exports = fetch(`http://127.0.0.1:${port}/stop`, { 20 | method: 'POST', 21 | headers: { 'Content-Type': 'application/json' }, 22 | body: JSON.stringify({ 23 | key: keyString 24 | }) 25 | }) 26 | .then(res => { 27 | if (res.ok) return res.json().then(console.log); 28 | console.error('Response of the `stop` request is not ok'); 29 | }) 30 | .catch(err => { 31 | switch (err.code) { 32 | case 'ECONNREFUSED': 33 | console.log('Connection refused'); 34 | break; 35 | case 'ECONNRESET': 36 | console.log('Connection aborted'); 37 | break; 38 | default: 39 | console.error(err); 40 | break; 41 
| } 42 | }) 43 | .then(() => console.log('Stopped')); 44 | -------------------------------------------------------------------------------- /src/config/index.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | server: { 3 | port: 3000, 4 | apiName: 'run' 5 | }, 6 | launch: { 7 | ignoreHTTPSErrors: true, 8 | headless: true, 9 | // userDataDir: './browser_data', 10 | args: [ 11 | '--no-sandbox', 12 | '--memory-pressure-thresholds=1', 13 | // '--memory-pressure-off', 14 | // '--force-fieldtrials=AutomaticTabDiscarding/Disabled', 15 | // '--renderer-process-limit=1000', 16 | // '--v8-cache-strategies-for-cache-storage=aggressive', 17 | // '--aggressive', 18 | // '--user-data-dir', 19 | // '--disable-renderer-backgrounding', 20 | // '--disable-javascript', 21 | // '-incognito', 22 | // '--aggressive-cache-discard', 23 | // '--aggressive-tab-discard', 24 | ] 25 | } 26 | }; 27 | -------------------------------------------------------------------------------- /src/config/privates.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | key: 'the specific key string for authentication' 3 | }; 4 | -------------------------------------------------------------------------------- /src/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | test 8 | 26 | 27 | 28 | 29 |
30 | 31 |
32 |

33 |         
34 |
35 |
36 | 37 |
38 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | const sticky = require('sticky-session'); 2 | const http = require('http'); 3 | const cluster = require('cluster'); 4 | 5 | const Koa = require('koa'); 6 | const Router = require('koa-router'); 7 | const bodyParser = require('koa-bodyparser'); 8 | const cors = require('@koa/cors'); 9 | 10 | const sockio = require('socket.io'); 11 | 12 | const { createReadStream } = require('fs'); 13 | 14 | const fnsb = require('function-sandbox'); 15 | 16 | const ObjectParse = require('./parse'); 17 | const Ppt = require('./puppeteer'); 18 | 19 | const config = require('./config'); 20 | const privates = require('./config/privates'); 21 | 22 | const path = require('path'); 23 | const ip = require('ip'); 24 | const ipAddr = ip.address(); 25 | 26 | const SessionMap = require('./table'); 27 | 28 | const gracefulShutdown = require('http-graceful-shutdown'); 29 | 30 | module.exports = async function (options = {}) { 31 | const useCluster = !!options.cluster; 32 | const test = !!options.test; 33 | const port = options.port || config.server.port; 34 | const apiName = options.api || config.server.apiName; 35 | 36 | Open_puppeteer: { 37 | await Ppt.open(options.puppeteer); 38 | console.log('Chrome puppeteer open'); 39 | } 40 | 41 | const app = new Koa(); 42 | const router = new Router(); 43 | 44 | if (test) { /* Serve static files for the test page */ 45 | router.get('/test/', (ctx, next) => { 46 | ctx.type = 'html'; 47 | ctx.body = createReadStream(path.resolve(__dirname, './index.html')); 48 | }); 49 | router.get('/server.config.js', ctx => { 50 | ctx.type = 'application/javascript'; 51 | ctx.body = `export default { host: '${ipAddr}', port: ${port}, apiName: '${apiName}' }`; 52 | }); 53 | router.get('/puppeteer-service-client.js', (ctx, next) => { 54 | ctx.type = 'application/javascript'; 55 
| const pscSrc = require.resolve('puppeteer-service-client'); 56 | const pscDist = pscSrc.replace(/(puppeteer-service-client)(.*)$/g, function (...args) { 57 | return args[1]; 58 | }); 59 | ctx.body = createReadStream(path.resolve(pscDist, 'dist/puppeteer-service-client.js')); 60 | }); 61 | } 62 | 63 | router.post(`/${apiName}`, async function (ctx) { 64 | ctx.response.header['Access-Control-Allow-Origin'] = ctx.request.origin; 65 | ctx.response.header['Content-Type'] = 'application/json; charset=utf-8'; 66 | const { sessId, sockId } = ctx.request.body; 67 | const data = ObjectParse(ctx.request.body.data); 68 | console.log('data', data); // test 69 | ctx.status = 200; 70 | 71 | const injection = { 72 | echo: function (data) { 73 | const skt = socksesses.get(sessId); 74 | if (!skt) return; 75 | skt.emit('server:echo', data); 76 | } 77 | }; 78 | 79 | if (!data.options) data.options = {}; 80 | if (!data.options.whiteList) data.options.whiteList = []; 81 | data.options.whiteList = data.options.whiteList.concat(Object.keys(injection)); 82 | 83 | console.log(ctx.request.url, ' begin'); // test 84 | 85 | ctx.body = await Ppt.run(data.url, fnsb(data.run, { 86 | ...data.options, 87 | asFunction: true 88 | }), injection); 89 | 90 | console.log(ctx.request.url, ' end'); // test 91 | if (cluster.isWorker) { 92 | console.log('worker', cluster.worker.id); // test 93 | } 94 | 95 | const skt = socksesses.get(sessId); 96 | skt && skt.emit('server:close', 'done') && skt.disconnect(); 97 | }); 98 | 99 | router.post(`/puppeteer`, async function (ctx, next) { 100 | ctx.response.header['Access-Control-Allow-Origin'] = ctx.request.origin; 101 | ctx.response.header['Content-Type'] = 'application/json; charset=utf-8'; 102 | let { key, cmd, opt } = ctx.request.body; 103 | ctx.status = 200; 104 | if (key === privates.key) { 105 | let re = await Ppt[cmd](opt); 106 | ctx.body = { 107 | code: re === 0 ? 0 : 1, 108 | message: re === 0 ? 
'success' : 'failure' 109 | }; 110 | } else { 111 | ctx.body = { 112 | code: -1, 113 | message: 'error' 114 | }; 115 | } 116 | await next(); 117 | }); 118 | 119 | router.post(`/stop`, async function (ctx, next) { 120 | let key = ctx.request.body.key; 121 | if (key === privates.key) { 122 | process.exit(0); 123 | } 124 | ctx.response.header['Access-Control-Allow-Origin'] = ctx.request.origin; 125 | ctx.response.header['Content-Type'] = 'application/json; charset=utf-8'; 126 | ctx.status = 200; 127 | ctx.body = { 128 | code: -1, 129 | message: 'error' 130 | }; 131 | await next(); 132 | }); 133 | 134 | // Serve_socketio_client_js: { 135 | // const sockioClientModuleIndex = require.resolve('socket.io-client'); 136 | // const sockioClientModuleDist = sockioClientModuleIndex.replace(/(socket\.io-client)(.*)$/, function (...args) { 137 | // return args[1]; 138 | // }); 139 | // router.get('/socket.io/socket.io.js', ctx => { 140 | // ctx.type = 'application/javascript'; 141 | // ctx.body = createReadStream(path.resolve(sockioClientModuleDist, './dist/socket.io.js')); 142 | // }); 143 | // } 144 | 145 | app 146 | .use(cors({ 147 | // origin: '*', 148 | credentials: true 149 | })) 150 | .use(bodyParser()) 151 | .use(router.routes()) 152 | .use(router.allowedMethods()) 153 | ; 154 | 155 | const server = http.createServer(app.callback()); 156 | 157 | const socksesses = new SessionMap(); 158 | Set_up_websocket: { 159 | const io = sockio(server); 160 | io.on('connect', function (socket) { 161 | const sid = socket.handshake.query.sessId; 162 | socksesses.put(sid, socket); 163 | socket.emit('server:greet', { hello: sid }); 164 | socket.on('client:some-event', function (data) { 165 | console.log('client:some-event', data); 166 | }); 167 | socket.on('disconnect', reason => { 168 | socksesses.del(sid); 169 | }); 170 | }); 171 | } 172 | 173 | gracefulShutdown(server, { 174 | onShutdown: () => { 175 | console.log('Closing...'); 176 | return Ppt.close(); 177 | } 178 | }); 179 | 180 | if 
(useCluster) { 181 | if (!sticky.listen(server, +port)) { 182 | // Master code 183 | server.once('listening', function () { 184 | console.log(`Server started on ${port} port`); 185 | }); 186 | } else { 187 | // Worker code 188 | } 189 | } else { 190 | server.listen(+port, function () { 191 | console.log(`Server started on ${port} port`); 192 | }); 193 | } 194 | 195 | return { 196 | koaApp: app, 197 | server 198 | }; 199 | }; 200 | -------------------------------------------------------------------------------- /src/parse.js: -------------------------------------------------------------------------------- 1 | const fnsb = require('function-sandbox'); 2 | 3 | const { evaluate, walk } = require('./util'); 4 | 5 | const ObjectParse = str => { 6 | if (typeof str !== 'string') return str; 7 | 8 | let obj; 9 | try { 10 | obj = evaluate(str); 11 | } catch (e) { 12 | console.error(e); 13 | } 14 | 15 | // walk(obj, (target, p, v) => { 16 | // if (typeof v === 'function') { 17 | // target[p] = fnsb(v, true); 18 | // } 19 | // }); 20 | 21 | return obj; 22 | }; 23 | 24 | module.exports = ObjectParse; 25 | -------------------------------------------------------------------------------- /src/puppeteer.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | 3 | // console.log(puppeteer.defaultArgs()); 4 | 5 | const { launch } = require('./config'); 6 | 7 | const { evaluate } = require('./util'); 8 | 9 | let browser; 10 | let status = 0; 11 | 12 | const usePagePool = true; 13 | const pagePoolSize = 100; 14 | const { PageBroker } = require('./scheduler'); 15 | let pageBroker; 16 | 17 | const open = async (options = {}) => { 18 | if (status === 1) return 0; 19 | browser = await puppeteer.launch({ 20 | ...launch, 21 | ...options 22 | }); 23 | status = 1; 24 | pageBroker = PageBroker(browser, { 25 | pooling: usePagePool, 26 | limit: pagePoolSize 27 | }); 28 | return 0; 29 | }; 30 | 31 | const close = () => { 
32 | return browser.close().then(_ => { 33 | console.log('Chromium and all of its pages have been closed.'); 34 | browser = null; 35 | status = 0; 36 | return 0; 37 | }).catch(e => { 38 | console.error(e); 39 | }); 40 | }; 41 | 42 | const run = async (url, fn, injection = {}) => { 43 | let page, pageId, result; 44 | try { 45 | const pageman = await pageBroker.open(); 46 | page = pageman.page; 47 | pageId = pageman.id; 48 | await page.goto(url); 49 | result = await evaluate(`(${fn})(page)`, { page, echo: injection.echo }); 50 | } catch (e) { 51 | console.error(e); 52 | result = {}; 53 | } 54 | pageBroker.close({ 55 | page, 56 | id: pageId 57 | }); 58 | return result || {}; 59 | }; 60 | 61 | // const run = async (url, fn, injection = {}) => { 62 | // return browser.newPage().then(async page => { 63 | // await page.goto(url); 64 | // return evaluate(`(${fn})(page)`, { page, echo: injection.echo }).then(data => { 65 | // page.close(); 66 | // return data; 67 | // }, () => { 68 | // page.close(); 69 | // return {}; 70 | // }); 71 | // }); 72 | // }; 73 | 74 | const pageCount = async () => { 75 | if (!browser) return -1; 76 | let pages = await browser.pages(); 77 | return pages.length; 78 | }; 79 | 80 | module.exports = { 81 | open, 82 | close, 83 | run, 84 | process: () => browser && browser.process(), 85 | pageCount 86 | }; 87 | -------------------------------------------------------------------------------- /src/scheduler.js: -------------------------------------------------------------------------------- 1 | const EventEmitter = require('events').EventEmitter; 2 | 3 | const PageBroker = function (browser, options = {}) { 4 | const usePagePool = !!options.pooling; 5 | const maxSize = 100; 6 | const minSize = 10; 7 | const limit = (typeof options.limit === 'number' && !isNaN(options.limit) && options.limit >= minSize) ? 
options.limit : maxSize; 8 | const pool = []; 9 | const event = new EventEmitter(); 10 | return { 11 | open: async () => { 12 | if (!usePagePool) { 13 | return { 14 | page: await browser.newPage() 15 | }; 16 | } 17 | if (pool.length < limit) { 18 | let page = await browser.newPage(); 19 | pool.push({ 20 | page, 21 | status: 0 22 | }); 23 | return { page, id: pool.length - 1 }; 24 | } 25 | return new Promise((resolve, reject) => { 26 | for (let id = 0, len = pool.length; id < len; id++) { 27 | if (pool[id].status === 0) { 28 | pool[id].status = 1; 29 | resolve({ page: pool[id].page, id }); 30 | return; 31 | } 32 | } 33 | event.on('page_close', function listener(page, id) { 34 | if (pool[id].status === 0) { 35 | pool[id].status = 1; 36 | this.removeListener('page_close', listener); 37 | resolve({ page, id }); 38 | } 39 | }); 40 | }); 41 | }, 42 | close({ page, id }) { 43 | if (!usePagePool) return page.close(); 44 | return page.goto('about:blank').then(_ => { 45 | pool[id].status = 0; 46 | event.emit('page_close', page, id); 47 | return page; 48 | }); 49 | } 50 | }; 51 | }; 52 | 53 | module.exports = { 54 | PageBroker 55 | }; 56 | -------------------------------------------------------------------------------- /src/table.js: -------------------------------------------------------------------------------- 1 | module.exports = function () { 2 | const ss = {}; 3 | const get = key => { 4 | return ss[key]; 5 | }; 6 | const put = (key, s) => { 7 | if (ss.hasOwnProperty(key)) return false; 8 | return ss[key] = s; 9 | }; 10 | const del = key => { 11 | return delete ss[key]; 12 | }; 13 | return { get, put, del }; 14 | }; 15 | -------------------------------------------------------------------------------- /src/util.js: -------------------------------------------------------------------------------- 1 | const vm = require('vm'); 2 | 3 | const fnsb = require('function-sandbox'); 4 | 5 | const evaluate = (str, injection = {}) => { 6 | const safeFn = fnsb(new Function(`return 
(${str})`), { 7 | whiteList: Object.keys(injection) 8 | }); 9 | const returnVarName = 'result'; 10 | const script = new vm.Script(`${returnVarName} = (${safeFn})()`); 11 | const sandbox = { ...injection }; 12 | script.runInNewContext(sandbox); 13 | return sandbox[returnVarName]; 14 | }; 15 | 16 | const walk = (obj, fn) => { 17 | switch (Object.prototype.toString.call(obj)) { 18 | case '[object Object]': 19 | Object.keys(obj).forEach(i => { 20 | fn(obj, i, obj[i]); 21 | walk(obj[i], fn); 22 | }); 23 | break; 24 | case '[object Array]': 25 | obj.forEach((v, i) => { 26 | fn(obj, i, v); 27 | walk(v, fn); 28 | }); 29 | break; 30 | } 31 | }; 32 | 33 | /** 34 | * IP Hash 35 | * https://github.com/indutny/sticky-session/blob/master/lib/sticky/master.js 36 | */ 37 | const ipHash = (function () { 38 | const seed = (Math.random() * 0xffffffff) | 0; 39 | 40 | return ip => { 41 | let hash = seed; 42 | 43 | for (let i = 0; i < ip.length; i++) { 44 | const num = ip[i]; 45 | 46 | hash += num; 47 | hash %= 2147483648; 48 | hash += hash << 10; 49 | hash %= 2147483648; 50 | hash ^= hash >> 6; 51 | } 52 | 53 | hash += hash << 3; 54 | hash %= 2147483648; 55 | hash ^= hash >> 11; 56 | hash += hash << 15; 57 | hash %= 2147483648; 58 | 59 | return hash >>> 0; 60 | }; 61 | })(); 62 | 63 | const randIn = (low, high) => { 64 | return low + (high - low) * Math.random(); 65 | }; 66 | 67 | const wait = (d, high) => { 68 | if (high) d = randIn(d, high); 69 | return new Promise(resolve => setTimeout(_ => resolve(1), d)); 70 | }; 71 | 72 | module.exports = { 73 | evaluate, 74 | walk, 75 | wait, 76 | ipHash, 77 | randIn 78 | }; 79 | -------------------------------------------------------------------------------- /test/163/account.js: -------------------------------------------------------------------------------- 1 | module.exports = require('../accounts').accountToSignin163(); 2 | -------------------------------------------------------------------------------- /test/163/clickLinkInEmail.js: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenfe/puppeteer-service/7b1731d1725d5376e23109f6054ac963278eec75/test/163/clickLinkInEmail.js -------------------------------------------------------------------------------- /test/163/openEmail.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenfe/puppeteer-service/7b1731d1725d5376e23109f6054ac963278eec75/test/163/openEmail.js -------------------------------------------------------------------------------- /test/163/signin.js: -------------------------------------------------------------------------------- 1 | module.exports = async page => { 2 | await wait(1000); 3 | 4 | if (!page.url().startsWith('https://ssl.mail.')) { 5 | await page.waitForSelector('#x-URS-iframe'); 6 | let frames = await page.frames(); 7 | const frame = frames[3]; 8 | const frameContext = await frame.executionContext(); 9 | 10 | const usernameSelector = '#login-form > div:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(2) > input:nth-of-type(1)'; 11 | const passwordSelector = '#login-form > div:nth-of-type(1) > div:nth-of-type(3) > div:nth-of-type(2) > input:nth-of-type(2)'; 12 | 13 | await frameContext.evaluate(`document.querySelector('${usernameSelector}').value = '${username}'`); 14 | await frameContext.evaluate(`document.querySelector('${passwordSelector}').value = '${password}'`); 15 | await frameContext.evaluate(`document.querySelector('#dologin').click()`); 16 | await page.waitForNavigation(); 17 | } 18 | 19 | await page.waitForSelector('.mboxlst'); 20 | const result = await page.evaluate(_ => { 21 | let q = document.querySelectorAll('.mboxlst'); 22 | return q[q.length - 1].innerText; 23 | }, 7); 24 | return { 25 | data: result 26 | }; 27 | }; 28 | -------------------------------------------------------------------------------- /test/163/test.js: 
-------------------------------------------------------------------------------- 1 | const Run = require('puppeteer-service-client'); 2 | 3 | const account = require('./account'); 4 | 5 | const runner = require('./signin'); 6 | 7 | const { wait } = require('../../src/util'); 8 | 9 | const { port, apiName } = require('../../src/config').server; 10 | 11 | Run(`http://127.0.0.1:${port}/${apiName}`, { 12 | url: 'https://mail.163.com/', 13 | run: runner, 14 | options: { 15 | injection: { 16 | wait, 17 | ...account 18 | } 19 | } 20 | }) 21 | .then(data => console.log(JSON.stringify(data))) 22 | .catch(err => console.error(err)); 23 | -------------------------------------------------------------------------------- /test/accounts/index.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | const path = require('path'); 3 | 4 | const accountList = require('./accounts.js'); 5 | 6 | const accountToSignin163 = () => { 7 | let { username, password } = accountList.find(item => (item.username && item.domain === '163.com')); 8 | return { 9 | username, 10 | password 11 | }; 12 | }; 13 | 14 | const accountToSigninGithub = () => { 15 | for (let a of accountList) { 16 | let r = readAccount(`${a.username}@${a.domain}`); 17 | if (r.github) return r.github; 18 | } 19 | }; 20 | 21 | const accountToSignupGithub = () => { 22 | for (let a of accountList) { 23 | if (a.github) continue; 24 | let r = readAccount(`${a.username}@${a.domain}`); 25 | if (r.github) continue; 26 | return { 27 | username: /([a-zA-Z]+)[0-9]*/.exec(a.username)[1], 28 | password: a.password, 29 | email: `${a.username}@${a.domain}`, 30 | origin: { 31 | ...a, 32 | ...r 33 | } 34 | }; 35 | } 36 | }; 37 | 38 | const readFile = filepath => { 39 | if (!fs.existsSync(filepath)) return {}; 40 | let text = fs.readFileSync(filepath, 'utf8'); 41 | try { 42 | let obj = JSON.parse(text); 43 | return obj; 44 | } catch (e) { 45 | return {}; 46 | } 47 | }; 48 | 49 | const 
/**
 * Merges `obj` into the JSON record stored for `obj.email` under
 * `./accounts/<email>.json` (relative to this directory) and writes it back.
 *
 * @param {object} [obj={}] - Fields to record; must carry an `email`.
 * @returns {false|undefined} `false` when no email is given, otherwise nothing.
 */
const recordAccount = (obj = {}) => {
    const { email } = obj;
    if (!email) {
        return false;
    }

    const target = path.resolve(__dirname, `./accounts/${email}.json`);
    // Fields already stored on disk are kept unless `obj` overrides them.
    const merged = Object.assign(readFile(target), obj);
    writeFile(target, merged);
};
title = await page.title(); 44 | return { 45 | title: title 46 | }; 47 | }, 48 | // socket: data => { 49 | // console.log('socket', data); 50 | // }, 51 | options: { 52 | injection: { 53 | i, 54 | wait 55 | } 56 | } 57 | }) 58 | .then(data => { 59 | console.timeEnd(`time ${i} consumed: `); 60 | // console.log(JSON.stringify(data)); 61 | }) 62 | .catch(err => console.error(err)); 63 | batchPromises.push(p); 64 | } 65 | 66 | Promise.all(batchPromises).then(() => { 67 | console.timeEnd('time consumed: '); 68 | }); 69 | -------------------------------------------------------------------------------- /test/github/account-signin.js: -------------------------------------------------------------------------------- 1 | module.exports = require('../accounts').accountToSigninGithub(); 2 | -------------------------------------------------------------------------------- /test/github/account-signup.js: -------------------------------------------------------------------------------- 1 | module.exports = require('../accounts').accountToSignupGithub(); 2 | -------------------------------------------------------------------------------- /test/github/openAndStarRepos.js: -------------------------------------------------------------------------------- 1 | const repoList = require('../accounts').githubRepos; 2 | 3 | const { randIn } = require('../../src/util'); 4 | 5 | module.exports = async (page, repos = repoList) => { 6 | if (!Array.isArray(repos)) repos = [repos]; 7 | for (let repo of repos) { 8 | if (Math.random() < 0.2) continue; 9 | await page.goto(`https://github.com/${repo}`); 10 | await page.waitFor(randIn(2000, 10000)); 11 | const starButton = 'button[aria-label="Star this repository"]'; 12 | await page.click(starButton); 13 | console.log(repo, 'star clicked'); 14 | await page.waitFor(randIn(2000, 10000)); 15 | } 16 | 17 | return page; 18 | }; 19 | -------------------------------------------------------------------------------- /test/github/signin.js: 
-------------------------------------------------------------------------------- 1 | const { wait } = require('../../src/util'); 2 | 3 | module.exports = async ({ username, password }, browser) => { 4 | const page = await browser.newPage(); 5 | await page.goto('https://github.com/login'); 6 | 7 | await wait(1000); 8 | 9 | const usernameSelector = 'input#login_field'; 10 | const passwordSelector = 'input#password'; 11 | 12 | await page.type(usernameSelector, username); 13 | await page.type(passwordSelector, password); 14 | await page.click('#login form > div:last-child input:last-child'); 15 | 16 | await page.waitForNavigation(); 17 | const result = await page.$eval('ul.mini-repo-list', e => e.innerText); 18 | console.log(result); 19 | 20 | return page; 21 | }; 22 | -------------------------------------------------------------------------------- /test/github/signup.js: -------------------------------------------------------------------------------- 1 | const { wait } = require('../../src/util'); 2 | 3 | const changeUsername = (username, salt) => { 4 | let s = 'abcdefghijklmnopqrstuvwxyz'[Math.ceil(Math.random() * 10000) % 26]; 5 | return { 6 | username: (_, s) => ('' + _ + s), 7 | salt: s 8 | }; 9 | }; 10 | 11 | const changePassword = (password, salt) => { 12 | let s1 = 'abcdefghijklmnopqrstuvwxyz'[Math.ceil(Math.random() * 10000) % 26]; 13 | let s2 = Math.ceil(Math.random() * 10000) % 10; 14 | let s = `${s1}${s2}`; 15 | return { 16 | password: (_, s) => ('' + _ + s), 17 | salt: s 18 | }; 19 | }; 20 | 21 | const setInput = async (page, selector, value) => { 22 | await page.$eval(selector, input => { input.value = '' }); 23 | await page.focus(selector); 24 | await page.keyboard.type(value); 25 | }; 26 | 27 | module.exports = async ({ username, password, email }, browser) => { 28 | const page = await browser.newPage(); 29 | 30 | const pageUrl = 'https://github.com/join'; 31 | await page.goto(pageUrl); 32 | 33 | await wait(1000); 34 | 35 | const usernameSelector = 
'input#user_login'; 36 | const emailSelector = 'input#user_email'; 37 | const passwordSelector = 'input#user_password'; 38 | const originUsername = username; 39 | const originPassword = password; 40 | let usernameSalt; 41 | let passwordSalt; 42 | while (true) { 43 | await setInput(page, usernameSelector, username); 44 | await setInput(page, emailSelector, email); 45 | await setInput(page, passwordSelector, password); 46 | await page.click('button#signup_button'); 47 | await page.waitForNavigation(); 48 | if (page.url() !== pageUrl) break; 49 | 50 | let usernameUpdate = changeUsername(originUsername, usernameSalt); 51 | usernameSalt = usernameUpdate.salt; 52 | username = usernameUpdate.username(originUsername, usernameSalt); 53 | console.log('change username to: ', username); 54 | 55 | let passwordUpdate = changePassword(originPassword, passwordSalt); 56 | passwordSalt = passwordUpdate.salt; 57 | password = passwordUpdate.password(originPassword, passwordSalt); 58 | console.log('change password to: ', password); 59 | } 60 | 61 | await page.click('button.js-choose-plan-submit'); 62 | await page.waitForNavigation(); 63 | 64 | const input1 = 'form.setup-form > fieldset:nth-of-type(1) input'; 65 | const input2 = 'form.setup-form > fieldset:nth-of-type(2) input'; 66 | const input3 = 'form.setup-form > fieldset:nth-of-type(3) input'; 67 | const input4 = 'form.setup-form > fieldset:nth-of-type(4) input[type="text"]'; 68 | await page.click(input1); 69 | await page.click(input2); 70 | await page.click(input3); 71 | await page.type(input4, 'web-development machine-learning '); 72 | await page.click('form.setup-form > input.btn-primary[name="commit"]'); 73 | 74 | await page.waitForNavigation(); 75 | if (page.url() === 'https://github.com/dashboard') { 76 | console.log('success: ', { 77 | username, 78 | password, 79 | email 80 | }); 81 | } 82 | 83 | return { 84 | page, 85 | github: { 86 | username, 87 | password 88 | }, 89 | email 90 | }; 91 | }; 92 | 
// Interactive sign-in test: asks for GitHub credentials on the terminal,
// launches Chromium and runs the sign-in runner with them.
const puppeteer = require('puppeteer');
const pupConf = require('../../src/config').launch;

const readlineSync = require('readline-sync');
const username = readlineSync.question('username: ');
const password = readlineSync.question('password: ', {
    hideEchoBack: true
});

if (readlineSync.question('set the executable path of chromium? [yn]: ') === 'y') {
    pupConf.executablePath = '..\\spiderman\\node_modules\\puppeteer\\.local-chromium\\win64-526987\\chrome-win32\\chrome.exe';
}

const account = { username, password };

const runner = require('./signin');

puppeteer.launch(pupConf)
    .then(async browser => {
        try {
            await runner(account, browser);
        } finally {
            // Close the browser even when the runner throws, otherwise the
            // Chromium process is left running.
            await browser.close();
        }
    })
    .catch(err => {
        console.error(err);
        process.exitCode = 1;
    });
/**
 * Measures the width of the vertical scrollbar in the current document.
 *
 * This function's source text is interpolated into a `page.evaluate` script,
 * so it has to stay self-contained: it may only use browser globals.
 *
 * @returns {number} Width difference between a 100px probe and a child that
 *     fills it once the probe is forced to show scrollbars.
 */
function getScrollbarWidth() {
    const probe = document.createElement('div');
    Object.assign(probe.style, { visibility: 'hidden', width: '100px' });
    document.body.appendChild(probe);

    // Must be read before scrollbars are forced on the probe.
    const plainWidth = probe.offsetWidth;

    probe.style.overflow = 'scroll';

    const filler = document.createElement('div');
    filler.style.width = '100%';
    probe.appendChild(filler);
    const scrolledWidth = filler.offsetWidth;

    // Leave the document as it was found.
    probe.parentNode.removeChild(probe);

    return plainWidth - scrolledWidth;
}
// Screenshot test: launches Chromium, captures the iqiyi home page through the
// screenshot runner and opens the resulting image file.
const url = 'http://www.iqiyi.com/';

const puppeteer = require('puppeteer');
const pupConf = require('../../src/config').launch;

const readlineSync = require('readline-sync');
if (readlineSync.question('set the executable path of chromium? [yn]: ') === 'y') {
    pupConf.executablePath = '..\\spiderman\\node_modules\\puppeteer\\.local-chromium\\win64-526987\\chrome-win32\\chrome.exe';
}

const runner = require('./screenshot');

const open = require('open');

puppeteer.launch(pupConf)
    .then(async browser => {
        let result;
        try {
            result = await runner(url, browser);
        } finally {
            // Close the browser even when the runner throws, otherwise the
            // Chromium process is left running.
            await browser.close();
        }
        open(result.path);
    })
    .catch(err => {
        console.error(err);
        process.exitCode = 1;
    });
/**
 * String Hash
 * Ref: http://werxltd.com/wp/2010/05/13/javascript-implementation-of-javas-string-hashcode-method/
 *
 * Installs `hashCode` on String.prototype unless one is already present.
 * The hash multiplies the running value by 31 (written as a shift and a
 * subtraction), adds each UTF-16 code unit and truncates to a signed 32-bit
 * integer after every step; the empty string hashes to 0.
 */
if (!String.prototype.hashCode) {
    String.prototype.hashCode = function () {
        var acc = 0;
        var pos = 0;
        var total = this.length;
        while (pos < total) {
            var unit = this.charCodeAt(pos);
            acc = (((acc << 5) - acc) + unit) | 0; // keep the value in 32-bit range
            pos += 1;
        }
        return acc;
    };
}
/**
 * Resolves `target` against `base` and returns an absolute URL string.
 *
 * A target that already starts with a URI scheme (`http:`, `data:`, ...) is
 * returned unchanged. Otherwise `base` is first made absolute against
 * `window.location.href`, and the target is resolved as protocol-relative
 * (`//host/...`), host-relative (`/path`) or relative to the base directory.
 *
 * @param {string} base - URL the target is relative to; may itself be relative.
 * @param {string} target - URL to resolve.
 * @returns {string} The absolute form of `target`.
 */
var recoverUrl = function (base, target) {
    // Only the text before the first colon can be a scheme, and only when it
    // has the RFC 3986 shape. The match is anchored so that a colon elsewhere
    // in a relative reference ('//host:8080/x', 'img/a:b.png') is not mistaken
    // for a scheme delimiter.
    var colonAt = target.indexOf(':');
    var scheme = colonAt > 0 ? target.slice(0, colonAt) : '';
    if (scheme && /^[a-z][a-z0-9+.-]*$/i.test(scheme)) {
        return target;
    }

    base = recoverUrl(window.location.href, base);
    var b = parseUrl(base);
    if (target.startsWith('//')) return b.protocol + target;
    if (target.startsWith('/')) return b.protocol + '//' + b.host + target;
    if (b.path.endsWith('/')) return b.protocol + '//' + b.host + b.path + target;
    // Drop the file name of the base path and append the target to its directory.
    return b.protocol + '//' + b.host + b.path.substring(0, b.path.lastIndexOf('/')) + '/' + target;
};
fetch(conf.serverHost + '/get/' + encodeURIComponent(url), { 118 | mode: 'cors', 119 | headers: {'Content-Type': 'text/plain'} 120 | }).then(res => { 121 | return res.text().then(data => { 122 | var regExp = /@font-face\s*\{[^}]+}/g; 123 | var results = data.match(regExp) || []; 124 | return interRules.concat(results.map(result => recoverCssUrls(result, url))); 125 | }); 126 | }).catch(err => { 127 | console.error(err); 128 | }); 129 | })); 130 | }; 131 | 132 | const ignoreNodeName = { 133 | '#text': true, 134 | '#comment': true, 135 | 'meta': true, 136 | 'script': true, 137 | 'style': true, 138 | 'iframe': true 139 | }; 140 | 141 | const PropertyTable = { 142 | 'display': {}, 143 | 'zoom': {}, 144 | 'flex-direction': {}, 145 | 'flex-wrap': {}, 146 | 'flex-flow': {}, 147 | 'justify-content': {}, 148 | 'align-items': {}, 149 | 'align-content': {}, 150 | 'order': {}, 151 | 'flex-grow': {}, 152 | 'flex-shrink': {}, 153 | 'flex-basis': {}, 154 | 'flex': {}, 155 | 'align-self': {}, 156 | 'position': {}, 157 | 'z-index': {}, 158 | 'width': { 159 | default: () => 'auto' 160 | }, 161 | 'height': { 162 | default: () => 'auto' 163 | }, 164 | 'max-width': { 165 | ignore: function (v) { 166 | return v === 'auto' || v === 'none'; 167 | } 168 | }, 169 | 'min-width': { 170 | ignore: function (v) { 171 | return v === 'auto' || v === 'none'; 172 | } 173 | }, 174 | 'max-height': { 175 | ignore: function (v) { 176 | return v === 'auto' || v === 'none'; 177 | } 178 | }, 179 | 'min-height': { 180 | ignore: function (v) { 181 | return v === 'auto' || v === 'none'; 182 | } 183 | }, 184 | 'top': { 185 | default: () => 'auto' 186 | }, 187 | 'right': { 188 | default: () => 'auto' 189 | }, 190 | 'bottom': { 191 | default: () => 'auto' 192 | }, 193 | 'left': { 194 | default: () => 'auto' 195 | }, 196 | 'background': {}, 197 | // 'background-color': {}, 198 | // 'background-size': {}, 199 | 'margin': { 200 | default: (type) => { 201 | var ignore = ['ul', 'p', 'dd', 'h1', 'h2', 'h3', 'h4', 
'body']; 202 | if (ignore.indexOf(type) >= 0) return false; 203 | return '0px'; 204 | } 205 | }, 206 | // 'margin-top': {}, 207 | // 'margin-right': {}, 208 | // 'margin-bottom': {}, 209 | // 'margin-left': {}, 210 | 'padding': {}, 211 | // 'padding-top': {}, 212 | // 'padding-right': {}, 213 | // 'padding-bottom': {}, 214 | // 'padding-left': {}, 215 | 'border': { 216 | ignore: function (v) { 217 | return v.indexOf('none') >= 0; 218 | } 219 | }, 220 | 'border-top': { 221 | ignore: function (v) { 222 | return v.indexOf('none') >= 0; 223 | } 224 | }, 225 | 'border-right': { 226 | ignore: function (v) { 227 | return v.indexOf('none') >= 0; 228 | } 229 | }, 230 | 'border-bottom': { 231 | ignore: function (v) { 232 | return v.indexOf('none') >= 0; 233 | } 234 | }, 235 | 'border-left': { 236 | ignore: function (v) { 237 | return v.indexOf('none') >= 0; 238 | } 239 | }, 240 | 'border-radius': {}, 241 | 'border-collapse': { 242 | inherit: true 243 | }, 244 | 'border-spacing': { 245 | inherit: true 246 | }, 247 | 'box-shadow': {}, 248 | 'box-sizing': {}, 249 | 'outline': { 250 | ignore: function (v) { 251 | return v.indexOf('none') >= 0; 252 | } 253 | }, 254 | 'color': { 255 | inherit: true 256 | }, 257 | 'text-align': { 258 | inherit: true 259 | }, 260 | 'text-indent': { 261 | inherit: true 262 | }, 263 | 'text-overflow': { 264 | default: () => 'clip' 265 | }, 266 | 'overflow-x': {}, 267 | 'overflow-y': {}, 268 | 'cursor': { 269 | inherit: true 270 | }, 271 | 'float': {}, 272 | 'clear': {}, 273 | 'table-layout': {}, 274 | 'font': { 275 | inherit: true 276 | }, 277 | /* 278 | 'font-family': { 279 | inherit: true 280 | }, 281 | 'font-size': { 282 | inherit: true 283 | }, 284 | 'font-weight': { 285 | inherit: true 286 | }, 287 | 'font-style': { 288 | inherit: true 289 | }, 290 | 'line-height': { 291 | inherit: true 292 | }, 293 | */ 294 | 'letter-spacing': { 295 | inherit: true 296 | }, 297 | 'list-style': { 298 | inherit: true 299 | }, 300 | 'opacity': {}, 301 | 
/**
 * Converts a hyphenated CSS property name to its camelCase form,
 * e.g. 'border-top-left-radius' -> 'borderTopLeftRadius'.
 *
 * @param {string} name - Hyphen-separated property name.
 * @returns {string} The first segment unchanged, followed by every later
 *     segment with its first character upper-cased.
 */
var propNameCamelify = function (name) {
    var parts = name.split('-');
    var re = parts[0] || '';
    for (var i = 1, len = parts.length; i < len; i++) {
        // Use the i-th segment; reading parts[1] on every pass repeated the
        // second segment for names with three or more segments.
        var p = parts[i];
        re += p.charAt(0).toUpperCase() + p.slice(1);
    }
    return re;
};
/**
 * Collects the styles of the pseudo elements listed in `pseudoClassTable`
 * for one DOM node.
 *
 * A pseudo element is included only when its computed `content` is truthy.
 * Inheritable properties whose value equals the host node's computed value
 * are removed from the collected style.
 *
 * @param {Element} dom - Host element.
 * @param {object} domStyle - Not used by this function.
 * @param {boolean} inSvg - Forwarded to `getFullStyle`.
 * @returns {object|null} Map from pseudo name to style object, or null when
 *     no pseudo element qualified.
 */
var getPseudoElements = function (dom, domStyle, inSvg) {
    var collected = {};

    for (var pseudoName in pseudoClassTable) {
        // Entries without an `element` kind are not handled so far.
        if (!pseudoClassTable[pseudoName].element) continue;
        if (!getComputedStyle(dom, ':' + pseudoName).content) continue;

        var pseudoStyle = getFullStyle(dom, pseudoName, inSvg);
        collected[pseudoName] = pseudoStyle;

        var hostStyle = getComputedStyle(dom);
        for (var prop in pseudoStyle) {
            if (PropertyTable[prop].inherit && hostStyle[propNameCamelify(prop)] === pseudoStyle[prop]) {
                delete pseudoStyle[prop];
            }
        }
    }

    return Object.keys(collected).length === 0 ? null : collected;
};
/**
 * Builds the metadata tree for `dom` by capturing it twice with
 * `getFullMetaData` (once as displayed, once with `display: none` set inline)
 * and patching the first capture with values from the second.
 *
 * NOTE(review): presumably the hidden capture is used because computed styles
 * of a non-rendered element report specified values ('auto', percentages)
 * instead of resolved pixel lengths — confirm.
 *
 * Side effect: after the call the element's inline `display` holds the value
 * that was computed before hiding it, not its previous inline value.
 *
 * @param {Element} dom - Root element to capture.
 * @returns {object} The patched metadata tree of the displayed capture.
 */
var getMetaData = function (dom) {
    var metaShow = getFullMetaData(dom);
    var originalDisplay = getComputedStyle(dom)['display'];
    dom.style.display = 'none';
    var metaHide = getFullMetaData(dom);
    dom.style.display = originalDisplay;

    // Properties whose pixel values are always taken from the displayed capture.
    var propsKeptInNode1 = ['transform', 'transform-origin', 'transition'];
    // Recursively merges node2 (hidden capture) into node1 (displayed capture).
    var patch = function (node1, node2) {
        var nodeName = node1.nodeName;
        if (node1.style) {
            for (var p in node1.style) {
                if (node1.style[p] === undefined) {
                    delete node1.style[p];
                    continue;
                }
                // Only values that mention 'px' are replaced by the hidden capture's value.
                if (/px/.test(node1.style[p]) && propsKeptInNode1.indexOf(p) < 0) {
                    if (node2.style[p] === undefined) {
                        delete node1.style[p];
                        continue;
                    }
                    node1.style[p] = node2.style[p];
                    // Drop the property again when the replacement is 'auto' (unless the table
                    // declares a different default), or a non-inherited property sits at its default.
                    if ((node1.style[p] === 'auto' && !(PropertyTable[p].default && node1.style[p] !== PropertyTable[p].default(nodeName)))
                        || (!PropertyTable[p].inherit && (PropertyTable[p].default && PropertyTable[p].default(nodeName) === node1.style[p]))) {
                        delete node1.style[p];
                    }
                }
            }
            // Bring over 'auto'-containing values that only the hidden capture has.
            for (var p in node2.style) {
                if (node1.style[p] == null && node2.style[p].indexOf('auto') >= 0 && (!PropertyTable[p].default || node2.style[p] !== PropertyTable[p].default(nodeName))) {
                    node1.style[p] = node2.style[p]; // this could fix the problem of margin auto 0
                }
            }
        }
        if (node1.childNodes) {
            // Children are paired by index between the two captures.
            for (var i = 0, len = node1.childNodes.length; i < len; i++) {
                patch(node1.childNodes[i], node2.childNodes[i]);
            }
        }
        if (node1.pseudo) {
            // Pseudo styles come from the hidden capture, except for the kept properties.
            for (var i in node1.pseudo) {
                var keptProps = {};
                for (let keptProp of propsKeptInNode1) {
                    if (node1.pseudo[i][keptProp]) keptProps[keptProp] = node1.pseudo[i][keptProp];
                }
                node1.pseudo[i] = extendObj(node2.pseudo[i], keptProps);
            }
        }
    };
    patch(metaShow, metaHide);
    return metaShow;
};
/**
 * Removes every attribute from an element.
 *
 * @param {Element} dom - Element to strip; it is modified in place.
 * @returns {Element} The same element, now without attributes.
 */
var cleanAttributes = function (dom) {
    for (;;) {
        if (!(dom.attributes.length > 0)) {
            break;
        }
        // Always remove the current first attribute; the list shrinks as we go.
        var first = dom.attributes[0];
        dom.removeAttribute(first.name);
    }
    return dom;
};
/**
 * Serialises a style object into CSS declarations.
 *
 * @param {object} obj - Map from property name to value.
 * @param {boolean} [indent] - When truthy, each declaration starts on its own
 *     indented line and a space follows the colon; otherwise the declarations
 *     are written back to back.
 * @returns {string} The concatenated `name:value;` declarations.
 */
var stringOfStyleObj = function (obj, indent) {
    var lead = '';
    var colon = ':';
    if (indent) {
        lead = '\n ';
        colon = ': ';
    }

    var pieces = [];
    for (var name in obj) {
        pieces.push(lead + name + colon + obj[name] + ';');
    }
    return pieces.join('');
};
/**
 * Register the style of one node (plus its pseudo-element styles) and return
 * the class name that carries it, reusing an existing class when one with the
 * same style hash and the same pseudo hashes was already generated.
 *
 * Side effects: updates `nodeTypeCount`, `cssRuleValueHash2Name`,
 * `cssRuleName2ValueHash` and `styleSheetData`.
 *
 * @param {string} nodeName - node name used as the class-name stem.
 * @param {Object} obj - the node's own style object.
 * @param {Object} [pseudo] - pseudo name -> style object (falsy entries are ignored).
 * @returns {string} the class name to put on the node.
 */
var addCssRule = function (nodeName, obj, pseudo) {
    // NOTE(review): `hashCode` is not a built-in String method; it is presumably
    // added to String.prototype elsewhere in this file — confirm.
    var selfHash = stringOfStyleObj(obj).hashCode();

    var pseudoValues = {};
    var pseudoHashes = {};
    if (pseudo) {
        for (var p in pseudo) {
            pseudoValues[p] = pseudo[p] || undefined;
            pseudoHashes[p] = pseudoValues[p] ? stringOfStyleObj(pseudoValues[p]).hashCode() : undefined;
        }
    }

    // Reuse a class whose own hash and every known pseudo hash are identical.
    var existingNameList = cssRuleValueHash2Name[selfHash];
    if (existingNameList) {
        for (let existingName of existingNameList) {
            var consistent = true;
            for (var q in pseudoClassTable) {
                if (cssRuleName2ValueHash[existingName + ':' + q] !== pseudoHashes[q]) {
                    consistent = false;
                    break;
                }
            }
            if (consistent) {
                return existingName;
            }
        }
    }

    if (!nodeTypeCount[nodeName]) nodeTypeCount[nodeName] = 0;
    nodeTypeCount[nodeName]++;
    var className = (conf.classNameModulePrefix ? (conf.moduleName + '-') : '')
        + (conf.classNameUpperCase ? nodeName.toUpperCase() : nodeName.toLowerCase())
        + nodeTypeCount[nodeName];

    if (!cssRuleValueHash2Name[selfHash]) cssRuleValueHash2Name[selfHash] = [];
    cssRuleValueHash2Name[selfHash].push(className);
    for (var h in pseudoHashes) {
        // Compare against undefined, not truthiness: a hash value of 0 is a
        // real hash and must be recorded, otherwise the strict comparison in
        // the lookup above can never match it and the rule is duplicated on
        // every call.
        if (pseudoHashes[h] !== undefined) cssRuleName2ValueHash[className + ':' + h] = pseudoHashes[h];
    }
    cssRuleName2ValueHash[className] = selfHash;

    styleSheetData['.' + className] = obj;
    for (var v in pseudoValues) {
        if (pseudoValues[v]) styleSheetData['.' + className + ':' + v] = pseudoValues[v];
    }

    return className;
};
/**
 * Return the helper iframe, creating it and appending it to `document.body`
 * on first use.
 *
 * @param {string} [iframeSrc] - when truthy, assigned to the iframe's `src`.
 * @returns {*} whatever `window.frames['qwe123']` resolves to, or the freshly
 *     created iframe element.
 */
var getHelperIframe = function (iframeSrc) {
    const helperId = 'qwe123';
    let frame = window.frames[helperId];
    if (!frame) {
        frame = document.createElement('iframe');
        frame.id = helperId;
        document.body.appendChild(frame);
    }
    if (iframeSrc) frame.src = iframeSrc;
    return frame;
};

/**
 * Compute the default computed style of a `nodeName` element by sampling one
 * inside the helper iframe's document (the sample is created and appended to
 * that document's body only when none exists yet).
 *
 * @param {string} nodeName - tag name to sample.
 * @param {boolean} [inSvg] - truthy to create the sample in the SVG namespace;
 *     implied when `nodeName` is 'svg'.
 * @returns {Object} a plain copy of the computed style, with 'transform-origin'
 *     replaced by `PropertyTable['transform-origin'].default(nodeName)` unless
 *     that hook is missing or returns `false`.
 */
var getNodeDefaultCS = function (nodeName, inSvg) {
    const svgContext = inSvg || (nodeName === 'svg');
    const helperDoc = getHelperIframe().contentDocument;
    const existing = helperDoc.getElementsByTagName(nodeName);

    let sample;
    if (existing.length) {
        sample = existing[0];
    } else {
        sample = svgContext
            ? helperDoc.createElementNS('http://www.w3.org/2000/svg', nodeName)
            : helperDoc.createElement(nodeName);
        helperDoc.body.appendChild(sample);
    }

    const defaults = extendObj({}, getComputedStyle(sample));

    // Properties whose default comes from PropertyTable instead of the sample.
    for (const prop of ['transform-origin']) {
        const entry = PropertyTable[prop];
        if (!entry || !entry.default) continue;
        const override = entry.default(nodeName);
        if (override === false) continue;
        defaults[propNameCamelify(prop)] = override;
    }
    return defaults;
};
/**
 * Plugin: hoist declarations shared by all child classes of a class into a
 * single `.parent>*` rule.
 *
 * For every class in the meta tree, the children of all nodes carrying that
 * class are pooled. If the pooled children use at least two distinct classes,
 * those classes share at least two identical declarations, and every node of
 * each child class is itself one of the pooled children, then the shared
 * declarations are moved to `styleData['.<class>>*']` and deleted from the
 * child class rules. Nodes that were handled get a `followers` property.
 *
 * @param {*} dom - unused by this plugin (part of the plugin call signature).
 * @param {Object} styleData - selector -> style object map; mutated in place.
 * @param {Object} metaData - root of the meta tree (`className`, `childNodes`).
 * @returns {undefined}
 */
var pl_extractCommonCssFromChildren = function (dom, styleData, metaData) {
    /* find the declarations every distinct child class has in common */
    var getChildrenCommonStyles = function (childNodes) {
        var minOfChildClassCount = 2;
        var minOfRepeatTime = 2;

        if (!childNodes) return null;
        var childClassCount = 0;
        var childrenCssStat = {};
        var allChildrenHave = {};
        var checkedClasses = {};
        for (let child of childNodes) {
            if (child.nodeName === '#text') continue;
            if (checkedClasses[child.className]) continue;
            childClassCount++;
            checkedClasses[child.className] = true;
            var cs = styleData['.' + child.className];
            for (var prop in cs) {
                // Count each "name: value" pair once per distinct class.
                var key = prop + ': ' + cs[prop];
                childrenCssStat[key] = (childrenCssStat[key] || 0) + 1;
            }
        }
        if (childClassCount >= minOfChildClassCount) {
            for (var pair in childrenCssStat) {
                if (childrenCssStat[pair] < childClassCount) continue;
                var splitPos = pair.indexOf(': ');
                allChildrenHave[pair.slice(0, splitPos)] = pair.slice(splitPos + 2);
            }
        }
        return Object.keys(allChildrenHave).length >= minOfRepeatTime ? allChildrenHave : null;
    };

    /* index: class name -> all meta nodes carrying it */
    var className2Nodes = {};
    var traverse = function (node, index = {}) {
        if (node.className) {
            if (!index[node.className]) index[node.className] = [];
            index[node.className].push(node);
        }
        if (node.childNodes) node.childNodes.forEach(child => { traverse(child, index); });
    };
    traverse(metaData, className2Nodes);

    var handler = function (node) {
        var className = node.className;
        if (!className) return;

        if (!node.followers) {
            var sameClassNodes = className2Nodes[className];
            var allChildNodes = sameClassNodes.reduce((prev, next) => {
                return (!next.childNodes ? prev : prev.concat(next.childNodes));
            }, []);
            var commonStyles = getChildrenCommonStyles(allChildNodes);
            if (commonStyles) {
                // Set membership replaces an indexOf scan that ran once per
                // same-class node inside the loop below.
                var pooledChildren = new Set(allChildNodes);
                var checkedClasses = {};
                var allIncluded = true;
                for (let child of allChildNodes) {
                    if (child.nodeName === '#text') continue;
                    if (checkedClasses[child.className]) continue;
                    checkedClasses[child.className] = true;
                    // A child class used anywhere outside the pool would lose
                    // its declarations there, so the hoist must be abandoned.
                    if (!className2Nodes[child.className].every(scn => pooledChildren.has(scn))) {
                        allIncluded = false;
                        break;
                    }
                }
                if (allIncluded) {
                    styleData['.' + className + '>*'] = commonStyles;
                    sameClassNodes.forEach(v => { v.followers = commonStyles; });

                    for (var c in checkedClasses) {
                        for (var i in commonStyles) {
                            delete styleData['.' + c][i];
                        }
                    }
                }
            }
        }

        if (node.childNodes) {
            for (let child of node.childNodes) {
                if (child.nodeName === '#text') continue;
                handler(child);
            }
        }
    };

    handler(metaData);
};
/**
 * Plugin: collapse equal `overflow-x` / `overflow-y` into the `overflow`
 * shorthand, for every rule in `styles` (mutated in place).
 *
 * @param {*} dom - unused by this plugin (part of the plugin call signature).
 * @param {Object} [styles] - selector -> style object map.
 * @returns {undefined}
 */
var pl_overflowCombine = function (dom, styles = {}) {
    for (const selector in styles) {
        const rule = styles[selector];
        const horizontal = rule['overflow-x'];
        if (!horizontal || horizontal !== rule['overflow-y']) continue;
        rule['overflow'] = horizontal;
        delete rule['overflow-x'];
        delete rule['overflow-y'];
    }
};

/**
 * Plugin: collapse four identical, non-empty `border-<side>` declarations
 * into the `border` shorthand, for every rule in `styles` (mutated in place).
 *
 * @param {*} dom - unused by this plugin (part of the plugin call signature).
 * @param {Object} [styles] - selector -> style object map.
 * @returns {undefined}
 */
var pl_borderCombile = function (dom, styles = {}) {
    const sides = ['border-top', 'border-right', 'border-bottom', 'border-left'];
    for (const selector in styles) {
        const rule = styles[selector];
        const values = sides.map((side) => rule[side]);
        if (!values.every(Boolean)) continue;
        const top = values[0];
        if (!values.every((value) => value === top)) continue;
        rule['border'] = top;
        sides.forEach((side) => { delete rule[side]; });
    }
};
// Post-processing plugins, run in this order by presentDom.
var plugins = [pl_overflowCombine, pl_borderCombile, pl_extractCommonCssFromChildren];

/**
 * Register an additional plugin; it is appended after the built-in ones.
 *
 * @param {Function} handler - called by presentDom with
 *     (rebuilt DOM, style-sheet data, root meta).
 */
var plugin = function (handler) {
    plugins.push(handler);
};

/**
 * Rebuild a DOM subtree from a meta tree, giving every element a generated
 * class (via addCssRule) and recording that class on the meta node as
 * `className`.
 *
 * @param {Object} meta - meta node (`nodeName`, `value` | `style`, `attrs`,
 *     `pseudo`, `childNodes`).
 * @param {boolean} [inSvg] - truthy when an ancestor is an `svg` element.
 * @returns {Node} a text node for '#text' metas, otherwise the new element.
 */
var buildDom = function (meta, inSvg) {
    if (meta.nodeName === '#text') {
        return document.createTextNode(meta.value);
    }

    const svgContext = inSvg || (meta.nodeName === 'svg');
    const element = svgContext
        ? document.createElementNS('http://www.w3.org/2000/svg', meta.nodeName)
        : document.createElement(meta.nodeName);

    if (meta.attrs) {
        for (const attrName in meta.attrs) {
            element.setAttribute(attrName, meta.attrs[attrName]);
        }
    }

    // Set after the copied attributes so the generated class always wins.
    const generatedClass = addCssRule(meta.nodeName, meta.style, meta.pseudo);
    element.setAttribute('class', generatedClass);
    meta.className = generatedClass;

    if (meta.childNodes) {
        for (const childMeta of meta.childNodes) {
            element.appendChild(buildDom(childMeta, svgContext));
        }
    }

    return element;
};

/**
 * Copy every enumerable property of `src` (own and inherited, as seen by
 * for...in) onto `dest`.
 *
 * @param {Object} dest - object to extend; mutated.
 * @param {Object} [src] - source of the properties; defaults to `{}`.
 * @returns {Object} `dest`.
 */
var extendObj = function (dest, src = {}) {
    for (const key in src) {
        dest[key] = src[key];
    }
    return dest;
};
/**
 * Replace the current page with a self-contained rebuild of `dom`.
 *
 * Steps: reset module state, merge `options` into `conf`, optionally start
 * fetching @font-face rules, capture the meta tree of `dom`, wipe
 * <head>/<body>, rebuild the DOM with generated class names, run the plugins,
 * scope every selector under the module class, write the style sheet, attach
 * the rebuilt DOM, and finally — once pending promises settle — report the
 * result (console, extension message when available, and an HTTP POST when
 * `conf.pullContent` is set).
 *
 * @param {Node} dom - root node to extract.
 * @param {string} [moduleName] - overrides `conf.moduleName` when truthy.
 * @param {Object} [options] - properties copied onto `conf`.
 * @returns {undefined}
 */
var presentDom = function (dom, moduleName, options) {

    initData();

    extendObj(conf, options);
    if (moduleName) conf.moduleName = moduleName;
    moduleName = conf.moduleName;

    var styleSheet = document.createElement('style');
    var ndom;

    var output = () => {
        var outputData = {
            name: moduleName,
            type: conf.generateType,
            style: styleSheet.innerHTML,
            // Element.nodeName is upper-case ('BODY') in HTML documents, so the
            // comparison has to be case-insensitive for the body branch to be
            // reachable.
            html: (ndom.nodeName.toLowerCase() === 'body') ? ndom.innerHTML : ndom.outerHTML
        };
        console.log(outputData);

        // NOTE(review): `chrome` can be defined on a page where `chrome.runtime`
        // is not — guard both so a missing messaging API cannot abort the
        // upload below.
        if (typeof chrome !== 'undefined' && chrome.runtime
            && typeof chrome.runtime.sendMessage === 'function') {
            chrome.runtime.sendMessage(
                JSON.parse(JSON.stringify(outputData)),
                function (response) {
                    console.log(response);
                });
        }

        if (conf.pullContent) {
            var postData = new FormData();
            postData.append('json', JSON.stringify(outputData));
            fetch(conf.serverHost + '/post', {
                method: 'post',
                mode: 'cors',
                headers: {
                    'Accept': '*'
                },
                body: postData
            }).then(function (res) { return res.json(); })
                .then(function (res) {
                    if (res.code === 200) {
                        console.log('[SUCCESS] to save the content.');
                    } else {
                        console.error('[ERROR] to save the content.');
                    }
                })
                .catch(function (err) {
                    // Network failure or non-JSON response: report it instead
                    // of leaving an unhandled rejection.
                    console.error('[ERROR] to save the content.', err);
                });
        }
    };

    var promises = [];

    // Font rules are requested before <head> is wiped below.
    if (conf.fetchFont) {
        promises.push(getFontFaces().then(results => {
            styleSheet.innerHTML = results.map(result => result.join('\n')).join('\n') + '\n' + styleSheet.innerHTML;
            console.log('[SUCCESS] to get all font-face rules.');
        }).catch(() => {
            console.error('[ERROR] to get all font-face rules.');
        }));
    }

    var rootMeta = getMetaData(dom);
    document.head.innerHTML = '';
    cleanAttributes(document.body).innerHTML = '';
    if (rootMeta.nodeName !== 'body') document.body.style.margin = '0';
    document.head.appendChild(styleSheet);

    ndom = buildDom(rootMeta); // will add a `className` to each valid node in `rootMeta`

    plugins.forEach(pl => pl.call(null, ndom, styleSheetData, rootMeta));

    // Scope every generated selector under the module class.
    var moduleClassNameAlready = ndom.getAttribute('class');
    var moduleClassAlone = !ndom.getElementsByClassName(moduleClassNameAlready).length;
    rootMeta.className = moduleClassAlone ? moduleName : (moduleName + ' ' + moduleClassNameAlready);
    ndom.setAttribute('class', rootMeta.className);

    var rootSelector = '.' + moduleClassNameAlready;
    // Iterate over a snapshot of the keys: the loop adds and deletes entries,
    // and visiting properties added during a for...in is not guaranteed
    // either way.
    Object.keys(styleSheetData).forEach(function (sel) {
        var rule = styleSheetData[sel];
        if (!rule) {
            delete styleSheetData[sel];
            return;
        }
        var isRootRule = sel === rootSelector
            || sel.startsWith(rootSelector + ':')
            || sel.startsWith(rootSelector + '>');
        if (isRootRule) {
            if (moduleClassAlone) {
                // The root's generated class is used nowhere else: rename it
                // to the module class outright.
                styleSheetData['.' + moduleName + sel.substr(1 + moduleClassNameAlready.length)] = rule;
                delete styleSheetData[sel];
                return;
            }
            // Shared class: add a compound selector for the root itself and
            // fall through to the descendant selector for the other users.
            styleSheetData['.' + moduleName + sel] = rule;
        }
        styleSheetData['.' + moduleName + ' ' + sel] = rule;
        delete styleSheetData[sel];
    });

    var styles = [];
    for (var sel in styleSheetData) {
        styles.push([sel, styleSheetData[sel]]);
    }
    styleSheet.innerHTML += styles
        .filter(rule => (Object.keys(rule[1]).length > 0))
        .map(rule => rule[0] + ' {' + stringOfStyleObj(rule[1], true) + '\n}').join('\n');

    if (rootMeta.nodeName !== 'body') document.body.appendChild(ndom);
    else {
        document.body.setAttribute('class', ndom.getAttribute('class'));
        document.body.innerHTML = ndom.innerHTML;
    }

    Promise.all(promises)
        .then(() => output())
        .catch(function (err) {
            console.error(err);
        });
};
/**
 * Reset the module-level state that accumulates while a page is processed:
 * the collected style-sheet rules and the three addCssRule lookup tables.
 * Each one is replaced by its own fresh empty object.
 */
var initData = function () {
    cssRuleName2ValueHash = {};
    cssRuleValueHash2Name = {};
    nodeTypeCount = {};
    styleSheetData = {};
};

// Public API exposed on the page.
window.fespider = {
    getMetaData,
    present: presentDom,
    plugin
};
// Manual test driver: prompts for credentials, launches a visible browser,
// runs the element extractor against `url`, saves the returned HTML under
// ./outputs and opens the saved file.
const url = 'http://www.zhihu.com/';

const puppeteer = require('puppeteer');
const pupConf = require('../../src/config').launch;

// Run with a visible browser window.
pupConf.headless = false;

const readlineSync = require('readline-sync');
// NOTE(review): the prompts say "weibo" although `url` is zhihu — presumably
// the sign-in goes through a weibo account; confirm against ./elem-extract.
const username = readlineSync.question('weibo username: ');
const password = readlineSync.question('weibo password: ', {
    hideEchoBack: true
});
if (readlineSync.question('set the executable path of chromium? [yn]: ') === 'y') {
    pupConf.executablePath = '..\\spiderman\\node_modules\\puppeteer\\.local-chromium\\win64-526987\\chrome-win32\\chrome.exe';
}

const runner = require('./elem-extract');

const fs = require('fs');
const path = require('path');
const open = require('open');

puppeteer.launch(pupConf).then(async browser => {
    let resultHtml;
    try {
        resultHtml = await runner({
            username,
            password,
            url
        }, browser);
    } finally {
        // Always release the browser process, even when the runner fails.
        await browser.close();
    }

    const filePath = path.resolve(__dirname, `outputs/${Date.now()}.html`);
    fs.writeFileSync(filePath, resultHtml, 'utf8');
    open(filePath);
}).catch(err => {
    // Surface launch/extraction/write failures instead of an unhandled rejection.
    console.error(err);
    process.exitCode = 1;
});