├── .editorconfig
├── .eslintignore
├── .eslintrc
├── .gitignore
├── .vscode
└── launch.json
├── LICENSE
├── README.md
├── package.json
├── scripts
├── restart.js
├── start.js
└── stop.js
├── src
├── config
│ ├── index.js
│ └── privates.js
├── index.html
├── index.js
├── parse.js
├── puppeteer.js
├── scheduler.js
├── table.js
└── util.js
└── test
├── 163
├── account.js
├── clickLinkInEmail.js
├── openEmail.js
├── signin.js
└── test.js
├── accounts
└── index.js
├── batch.js
├── github
├── account-signin.js
├── account-signup.js
├── openAndStarRepos.js
├── signin.js
├── signup.js
├── test-signin-repl.js
├── test-signin.js
├── test-signup-batch.js
└── test-signup.js
├── iqiyi
├── screenshot.js
├── screenshots
│ └── .gitkeep
└── test-screenshot.js
└── zhihu
├── elem-extract.js
├── fespider.js
├── outputs
└── .gitkeep
└── test-elem-extract.js
/.editorconfig:
--------------------------------------------------------------------------------
1 | root = true
2 |
3 | [*]
4 | end_of_line = lf
5 | insert_final_newline = true
6 | charset = utf-8
7 |
8 | [src/**.{js}]
9 | indent_style = space
10 | indent_size = 4
11 |
12 | [*.json]
13 | indent_size = 2
14 |
--------------------------------------------------------------------------------
/.eslintignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenfe/puppeteer-service/7b1731d1725d5376e23109f6054ac963278eec75/.eslintignore
--------------------------------------------------------------------------------
/.eslintrc:
--------------------------------------------------------------------------------
1 | {
2 | "extends": "standard",
3 | "parserOptions": {
4 | "ecmaVersion": 8
5 | },
6 | "rules": {
7 | "semi": 0,
8 | "indent": ["error", 4],
9 | "camelcase": 0,
10 | "space-before-function-paren": ["error", {
11 | "anonymous": "always",
12 | "named": "never",
13 | "asyncArrow": "always"
14 | }],
15 | "no-fallthrough": 0,
16 | "no-unused-vars": 0,
17 | "eqeqeq": 0,
18 | "operator-linebreak": 0,
19 | "no-labels": 0,
20 | "no-inner-declarations": 0,
21 | "no-undef": 0,
22 | "standard/no-callback-literal": 0,
23 | "key-spacing": 0,
24 | "no-new-func": 0,
25 | "no-unused-expressions": 0,
26 | "no-new": 0,
27 | "eol-last": 0,
28 | "no-global-assign": 0
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | package-lock.json
3 | .DS_Store
4 | accounts*
5 | test/iqiyi/screenshots/*.png
6 | test/zhihu/outputs/*.html
7 |
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": "0.2.0",
3 | "configurations": [
4 | {
5 | "type": "node",
6 | "request": "attach",
7 | "name": "Attach by Process ID",
8 | "processId": "${command:PickProcess}",
9 | "port": 3000,
10 | "protocol": "inspector"
11 | },
12 | {
13 | "type": "node",
14 | "request": "launch",
15 | "name": "stop",
16 | "program": "${workspaceFolder}/scripts/stop.js"
17 | },
18 | {
19 | "type": "node",
20 | "request": "launch",
21 | "name": "test 163",
22 | "program": "${workspaceFolder}/test/163/test.js"
23 | },
24 | {
25 | "type": "node",
26 | "request": "launch",
27 | "name": "test github signin",
28 | "program": "${workspaceFolder}/test/github/test-signin.js"
29 | },
30 | {
31 | "type": "node",
32 | "request": "launch",
33 | "name": "test github signup",
34 | "program": "${workspaceFolder}/test/github/test-signup.js"
35 | }
36 | ]
37 | }
38 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Shen
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # puppeteer-service
2 |
3 |
4 |
5 | 🎠 Run [GoogleChrome/puppeteer](https://github.com/GoogleChrome/puppeteer) as a service.
6 |
7 | ## Usage
8 |
9 | ### Server
10 |
11 | ```bash
12 | $ npm install puppeteer-service --save
13 | ```
14 |
15 | ```js
16 | const PuppeteerService = require('puppeteer-service');
17 | const { koaApp, server } = PuppeteerService({
18 | cluster: true, // default: false
19 | port: 3000, // default
20 | api: 'run', // default
21 | test: true, // default: false
22 | puppeteer: {
23 | // See https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md#puppeteerlaunchoptions
24 | headless: true, // default
25 | args: ['--no-sandbox']
26 | }
27 | });
28 | ```
29 |
30 | 😯 If the `test` option is set to `true` as above, you can visit the test page at `http://your.host:3000/test/`.
31 |
32 | ### Client
33 |
34 | #### 👉 Option 1: Use puppeteer-service-client
35 |
36 | ```bash
37 | $ npm install puppeteer-service-client --save
38 | ```
39 |
40 | Use [puppeteer-service-client](https://github.com/shenfe/puppeteer-service-client) to communicate with the server. It runs in **both the browser and Node.js**.
41 |
42 | ```js
43 | const Run = require('puppeteer-service-client');
44 | Run('http://your.host:3000/run', {
45 | /* Entry page url */
46 | url: 'https://target.com/',
47 |
48 | /* Runner function */
49 | run: async page => {
50 | const title = await page.title();
51 | echo({ url: page.url(), title });
52 | return {
53 | info: b(a, title)
54 | };
55 | },
56 |
57 | /* Options (Optional) */
58 | options: {
59 | /* Variables to inject */
60 | /* Identifiers and their corresponding literal values will be injected
61 | as variable declarations into the runner function. */
62 | injection: {
63 | a: 'Welcome to ',
64 | b: function (x, y) {
65 | return x + y;
66 | }
67 | }
68 | },
69 |
70 | /* WebSocket data handler (Optional) */
71 | socket: data => {
72 | /**/
73 | }
74 | })
75 | .then(data => {
76 | /**/
77 | }).catch(error => {
78 | /**/
79 | });
80 | ```
81 |
82 | **socket and echo**
83 |
84 | The `socket` option specifies a handler for WebSocket data at client side. Correspondingly, the function `echo`, which is callable inside the "page runner function", is a built-in function whose responsibility is to transfer data to the right socket connection with the client.
85 |
86 | #### 👉 Option 2: Send a request directly
87 |
88 | As the following does:
89 |
90 | ```js
91 | const pageRunner = async page => {
92 | const title = await page.title();
93 | return {
94 | info: b(a, title)
95 | };
96 | };
97 | fetch('http://your.host:3000/run', {
98 | method: 'POST',
99 | /*...*/
100 | headers: {
101 | 'Content-Type': 'application/json'
102 | },
103 | body: JSON.stringify({
104 | data: `{
105 | url: 'https://www.sogou.com',
106 | run: ${pageRunner},
107 | options: {
108 | injection: {
109 | a: 'Welcome to ',
110 | b: function (x, y) {
111 | return x + y;
112 | }
113 | }
114 | }
115 | }`
116 | })
117 | })
118 | .then(res => {
119 | if (res.ok) return res.json();
120 | throw new Error('Response is not ok');
121 | })
122 | .then(data => {
123 | /**/
124 | }).catch(error => {
125 | /**/
126 | });
127 | ```
128 |
129 | ⚠️ This approach is lightweight, but a plain request opens no WebSocket connection, so data sent by `echo` never reaches the client.
130 |
131 | ## Development
132 |
133 | Some commands:
134 |
135 | ```bash
136 | npm start # start
137 | npm start -- -p 3000 # port
138 | npm start -- -c # cluster
139 | npm run debug # debugging mode
140 | npm test # test
141 | npm test -- -u http://127.0.0.1:3000/run # api url
142 | npm test -- -n 10 # batch number
143 | ```
144 |
145 | ## License
146 |
147 | [MIT](http://opensource.org/licenses/MIT)
148 |
149 | Copyright © 2018-present, [shenfe](https://github.com/shenfe)
150 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "scripts": {
3 | "start": "node ./scripts/start.js",
4 | "stop": "node ./scripts/stop.js",
5 | "restart": "node ./scripts/restart.js",
6 | "debug": "node --inspect ./scripts/start.js",
7 | "test": "node ./test/batch.js"
8 | },
9 | "dependencies": {
10 | "@koa/cors": "^2.2.1",
11 | "clusterun": "^0.1.1",
12 | "function-sandbox": "^1.1.2",
13 | "http-graceful-shutdown": "^2.1.0",
14 | "ip": "^1.1.5",
15 | "koa": "^2.3.0",
16 | "koa-bodyparser": "^4.2.0",
17 | "koa-router": "^7.2.1",
18 | "koa-session": "^5.5.0",
19 | "node-fetch": "^2.0.0",
20 | "puppeteer": "^1.0.0",
21 | "puppeteer-service-client": "^0.1.6",
22 | "socket.io": "^2.0.4",
23 | "sticky-session": "^1.1.2"
24 | },
25 | "devDependencies": {
26 | "Base64": "^1.0.1",
27 | "babel-core": "^6.26.0",
28 | "eslint": "^4.9.0",
29 | "eslint-config-standard": "^10.2.1",
30 | "eslint-plugin-import": "^2.8.0",
31 | "eslint-plugin-node": "^5.2.0",
32 | "eslint-plugin-promise": "^3.6.0",
33 | "eslint-plugin-standard": "^3.0.1",
34 | "http-server": "^0.11.1",
35 | "open": "0.0.5",
36 | "readline-sync": "^1.4.9"
37 | },
38 | "name": "puppeteer-service",
39 | "description": "Run headless Chrome (aka Puppeteer) as a service, for web crawling, remote controlling and so on.",
40 | "version": "0.4.8",
41 | "main": "src/index.js",
42 | "directories": {
43 | "test": "test"
44 | },
45 | "repository": {
46 | "type": "git",
47 | "url": "git+https://github.com/shenfe/puppeteer-service.git"
48 | },
49 | "keywords": [
50 | "puppeteer",
51 | "headless-chrome",
52 | "service",
53 | "web-crawler"
54 | ],
55 | "author": "hengwu",
56 | "license": "MIT",
57 | "bugs": {
58 | "url": "https://github.com/shenfe/puppeteer-service/issues"
59 | },
60 | "homepage": "https://github.com/shenfe/puppeteer-service#readme"
61 | }
62 |
--------------------------------------------------------------------------------
/scripts/restart.js:
--------------------------------------------------------------------------------
// Restart sequence: `./stop` exports the promise of its stop request, so wait
// for that to settle before loading `./start`, which boots a new instance.
(async function restart() {
    const stopping = require('./stop');
    await stopping;
    require('./start');
})();
5 |
--------------------------------------------------------------------------------
/scripts/start.js:
--------------------------------------------------------------------------------
/**
 * CLI entry point that boots the service.
 *
 * Flags:
 *   -p, --port <number>   port to listen on (otherwise the service default is used)
 *   -c, --cluster         start with the `cluster` option enabled
 */
let port;

let useCluster = false;

const args = process.argv.slice(2);
args.forEach(function (val, index) {
    switch (val) {
        case '-p':
        case '--port': {
            const candidate = Number(args[index + 1]);
            if (Number.isInteger(candidate) && candidate > 0 && candidate < 65536) {
                port = candidate;
            } else {
                // Do not pass NaN / fractions / out-of-range values down to `listen`
                console.warn(`Ignoring invalid port "${args[index + 1]}"; the default port will be used`);
            }
            break;
        }
        case '-c':
        case '--cluster':
            useCluster = true;
            break;
    }
});

const index = require('../src');

const run = (ifUseCluster) => index({
    cluster: ifUseCluster,
    test: true,
    // Only pass `port` when one was given, so the service default applies otherwise
    ...(port && { port }),
    puppeteer: {
        // headless: false,
        // executablePath: '..\\spiderman\\node_modules\\puppeteer\\.local-chromium\\win64-526987\\chrome-win32\\chrome.exe'
    }
}).then(({ koaApp, server }) => {
    // do stuff
});

console.log('Starting...');

// Note kept from the original author (translated): using Node's cluster module
// causes session mix-ups. If clustering is wanted, start several
// puppeteer-service instances on different ports and dispatch between them by
// ip-hash at a higher layer.
run(useCluster).catch(err => {
    // A failed launch (e.g. Chromium cannot start) must not end as an unhandled rejection
    console.error('Failed to start:', err);
    process.exit(1);
});
41 |
--------------------------------------------------------------------------------
/scripts/stop.js:
--------------------------------------------------------------------------------
1 | const fetch = require('node-fetch');
2 | const config = require('../src/config');
3 | const keyString = require('../src/config/privates').key;
4 |
5 | let port;
6 |
7 | const args = process.argv.slice(2);
8 | args.forEach(function (val, index, array) {
9 | switch (val) {
10 | case '-p':
11 | case '--port':
12 | port = +args[index + 1];
13 | break;
14 | }
15 | });
16 |
17 | port = port || config.server.port;
18 |
19 | module.exports = fetch(`http://127.0.0.1:${port}/stop`, {
20 | method: 'POST',
21 | headers: { 'Content-Type': 'application/json' },
22 | body: JSON.stringify({
23 | key: keyString
24 | })
25 | })
26 | .then(res => {
27 | if (res.ok) return res.json().then(console.log);
28 | console.error('Response of the `stop` request is not ok');
29 | })
30 | .catch(err => {
31 | switch (err.code) {
32 | case 'ECONNREFUSED':
33 | console.log('Connection refused');
34 | break;
35 | case 'ECONNRESET':
36 | console.log('Connection aborted');
37 | break;
38 | default:
39 | console.error(err);
40 | break;
41 | }
42 | })
43 | .then(() => console.log('Stopped'));
44 |
--------------------------------------------------------------------------------
/src/config/index.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | server: {
3 | port: 3000,
4 | apiName: 'run'
5 | },
6 | launch: {
7 | ignoreHTTPSErrors: true,
8 | headless: true,
9 | // userDataDir: './browser_data',
10 | args: [
11 | '--no-sandbox',
12 | '--memory-pressure-thresholds=1',
13 | // '--memory-pressure-off',
14 | // '--force-fieldtrials=AutomaticTabDiscarding/Disabled',
15 | // '--renderer-process-limit=1000',
16 | // '--v8-cache-strategies-for-cache-storage=aggressive',
17 | // '--aggressive',
18 | // '--user-data-dir',
19 | // '--disable-renderer-backgrounding',
20 | // '--disable-javascript',
21 | // '-incognito',
22 | // '--aggressive-cache-discard',
23 | // '--aggressive-tab-discard',
24 | ]
25 | }
26 | };
27 |
--------------------------------------------------------------------------------
/src/config/privates.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | key: 'the specific key string for authentication'
3 | };
4 |
--------------------------------------------------------------------------------
/src/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | test
8 |
26 |
27 |
28 |
29 |
35 |
36 |
37 |
38 |
74 |
75 |
76 |
--------------------------------------------------------------------------------
/src/index.js:
--------------------------------------------------------------------------------
1 | const sticky = require('sticky-session');
2 | const http = require('http');
3 | const cluster = require('cluster');
4 |
5 | const Koa = require('koa');
6 | const Router = require('koa-router');
7 | const bodyParser = require('koa-bodyparser');
8 | const cors = require('@koa/cors');
9 |
10 | const sockio = require('socket.io');
11 |
12 | const { createReadStream } = require('fs');
13 |
14 | const fnsb = require('function-sandbox');
15 |
16 | const ObjectParse = require('./parse');
17 | const Ppt = require('./puppeteer');
18 |
19 | const config = require('./config');
20 | const privates = require('./config/privates');
21 |
22 | const path = require('path');
23 | const ip = require('ip');
24 | const ipAddr = ip.address();
25 |
26 | const SessionMap = require('./table');
27 |
28 | const gracefulShutdown = require('http-graceful-shutdown');
29 |
30 | module.exports = async function (options = {}) {
31 | const useCluster = !!options.cluster;
32 | const test = !!options.test;
33 | const port = options.port || config.server.port;
34 | const apiName = options.api || config.server.apiName;
35 |
36 | Open_puppeteer: {
37 | await Ppt.open(options.puppeteer);
38 | console.log('Chrome puppeteer open');
39 | }
40 |
41 | const app = new Koa();
42 | const router = new Router();
43 |
44 | if (test) { /* Serve static files for the test page */
45 | router.get('/test/', (ctx, next) => {
46 | ctx.type = 'html';
47 | ctx.body = createReadStream(path.resolve(__dirname, './index.html'));
48 | });
49 | router.get('/server.config.js', ctx => {
50 | ctx.type = 'application/javascript';
51 | ctx.body = `export default { host: '${ipAddr}', port: ${port}, apiName: '${apiName}' }`;
52 | });
53 | router.get('/puppeteer-service-client.js', (ctx, next) => {
54 | ctx.type = 'application/javascript';
55 | const pscSrc = require.resolve('puppeteer-service-client');
56 | const pscDist = pscSrc.replace(/(puppeteer-service-client)(.*)$/g, function (...args) {
57 | return args[1];
58 | });
59 | ctx.body = createReadStream(path.resolve(pscDist, 'dist/puppeteer-service-client.js'));
60 | });
61 | }
62 |
63 | router.post(`/${apiName}`, async function (ctx) {
64 | ctx.response.header['Access-Control-Allow-Origin'] = ctx.request.origin;
65 | ctx.response.header['Content-Type'] = 'application/json; charset=utf-8';
66 | const { sessId, sockId } = ctx.request.body;
67 | const data = ObjectParse(ctx.request.body.data);
68 | console.log('data', data); // test
69 | ctx.status = 200;
70 |
71 | const injection = {
72 | echo: function (data) {
73 | const skt = socksesses.get(sessId);
74 | if (!skt) return;
75 | skt.emit('server:echo', data);
76 | }
77 | };
78 |
79 | if (!data.options) data.options = {};
80 | if (!data.options.whiteList) data.options.whiteList = [];
81 | data.options.whiteList = data.options.whiteList.concat(Object.keys(injection));
82 |
83 | console.log(ctx.request.url, ' begin'); // test
84 |
85 | ctx.body = await Ppt.run(data.url, fnsb(data.run, {
86 | ...data.options,
87 | asFunction: true
88 | }), injection);
89 |
90 | console.log(ctx.request.url, ' end'); // test
91 | if (cluster.isWorker) {
92 | console.log('worker', cluster.worker.id); // test
93 | }
94 |
95 | const skt = socksesses.get(sessId);
96 | skt && skt.emit('server:close', 'done') && skt.disconnect();
97 | });
98 |
99 | router.post(`/puppeteer`, async function (ctx, next) {
100 | ctx.response.header['Access-Control-Allow-Origin'] = ctx.request.origin;
101 | ctx.response.header['Content-Type'] = 'application/json; charset=utf-8';
102 | let { key, cmd, opt } = ctx.request.body;
103 | ctx.status = 200;
104 | if (key === privates.key) {
105 | let re = await Ppt[cmd](opt);
106 | ctx.body = {
107 | code: re === 0 ? 0 : 1,
108 | message: re === 0 ? 'success' : 'failure'
109 | };
110 | } else {
111 | ctx.body = {
112 | code: -1,
113 | message: 'error'
114 | };
115 | }
116 | await next();
117 | });
118 |
119 | router.post(`/stop`, async function (ctx, next) {
120 | let key = ctx.request.body.key;
121 | if (key === privates.key) {
122 | process.exit(0);
123 | }
124 | ctx.response.header['Access-Control-Allow-Origin'] = ctx.request.origin;
125 | ctx.response.header['Content-Type'] = 'application/json; charset=utf-8';
126 | ctx.status = 200;
127 | ctx.body = {
128 | code: -1,
129 | message: 'error'
130 | };
131 | await next();
132 | });
133 |
134 | // Serve_socketio_client_js: {
135 | // const sockioClientModuleIndex = require.resolve('socket.io-client');
136 | // const sockioClientModuleDist = sockioClientModuleIndex.replace(/(socket\.io-client)(.*)$/, function (...args) {
137 | // return args[1];
138 | // });
139 | // router.get('/socket.io/socket.io.js', ctx => {
140 | // ctx.type = 'application/javascript';
141 | // ctx.body = createReadStream(path.resolve(sockioClientModuleDist, './dist/socket.io.js'));
142 | // });
143 | // }
144 |
145 | app
146 | .use(cors({
147 | // origin: '*',
148 | credentials: true
149 | }))
150 | .use(bodyParser())
151 | .use(router.routes())
152 | .use(router.allowedMethods())
153 | ;
154 |
155 | const server = http.createServer(app.callback());
156 |
157 | const socksesses = new SessionMap();
158 | Set_up_websocket: {
159 | const io = sockio(server);
160 | io.on('connect', function (socket) {
161 | const sid = socket.handshake.query.sessId;
162 | socksesses.put(sid, socket);
163 | socket.emit('server:greet', { hello: sid });
164 | socket.on('client:some-event', function (data) {
165 | console.log('client:some-event', data);
166 | });
167 | socket.on('disconnect', reason => {
168 | socksesses.del(sid);
169 | });
170 | });
171 | }
172 |
173 | gracefulShutdown(server, {
174 | onShutdown: () => {
175 | console.log('Closing...');
176 | return Ppt.close();
177 | }
178 | });
179 |
180 | if (useCluster) {
181 | if (!sticky.listen(server, +port)) {
182 | // Master code
183 | server.once('listening', function () {
184 | console.log(`Server started on ${port} port`);
185 | });
186 | } else {
187 | // Worker code
188 | }
189 | } else {
190 | server.listen(+port, function () {
191 | console.log(`Server started on ${port} port`);
192 | });
193 | }
194 |
195 | return {
196 | koaApp: app,
197 | server
198 | };
199 | };
200 |
--------------------------------------------------------------------------------
/src/parse.js:
--------------------------------------------------------------------------------
1 | const fnsb = require('function-sandbox');
2 |
3 | const { evaluate, walk } = require('./util');
4 |
/**
 * Turn a string of JavaScript source into the value it evaluates to, using
 * `evaluate` from ./util.
 *
 * @param {*} str source text; anything that is not a string is returned untouched
 * @returns {*} the evaluated value, or `undefined` when evaluation throws
 *              (the error is logged, not rethrown)
 */
const ObjectParse = str => {
    if (typeof str === 'string') {
        try {
            return evaluate(str);
        } catch (err) {
            console.error(err);
            return undefined;
        }
    }
    return str;
};
23 |
24 | module.exports = ObjectParse;
25 |
--------------------------------------------------------------------------------
/src/puppeteer.js:
--------------------------------------------------------------------------------
1 | const puppeteer = require('puppeteer');
2 |
3 | // console.log(puppeteer.defaultArgs());
4 |
5 | const { launch } = require('./config');
6 |
7 | const { evaluate } = require('./util');
8 |
9 | let browser;
10 | let status = 0;
11 |
12 | const usePagePool = true;
13 | const pagePoolSize = 100;
14 | const { PageBroker } = require('./scheduler');
15 | let pageBroker;
16 |
/**
 * Launch the shared browser and create its page broker. Does nothing when the
 * browser is already marked as open.
 *
 * @param {Object} [options] launch options; they override the defaults from ./config
 * @returns {Promise<number>} always 0
 */
const open = async (options = {}) => {
    const alreadyOpen = status === 1;
    if (!alreadyOpen) {
        const launchOptions = { ...launch, ...options };
        browser = await puppeteer.launch(launchOptions);
        status = 1;
        const brokerOptions = { pooling: usePagePool, limit: pagePoolSize };
        pageBroker = PageBroker(browser, brokerOptions);
    }
    return 0;
};
30 |
/**
 * Close the shared browser.
 *
 * @returns {Promise<number|undefined>} 0 when the browser is closed (or was
 *          never open); `undefined` when closing failed (the error is logged)
 */
const close = () => {
    // Nothing was launched (or it is already closed): calling `.close()` on
    // the empty reference would throw synchronously, e.g. during shutdown.
    if (!browser) return Promise.resolve(0);
    return browser.close().then(_ => {
        console.log('Chromium and all of its pages have been closed.');
        browser = null;
        status = 0;
        return 0;
    }).catch(e => {
        console.error(e);
    });
};
41 |
/**
 * Open a page from the broker, navigate to `url` and call the runner on it.
 *
 * @param {string} url page to open first
 * @param {Function|string} fn runner; it is interpolated into source text as `(${fn})(page)`
 * @param {Object} [injection] extra helpers; only `injection.echo` is passed to the runner
 * @returns {Promise<*>} the runner's result, or `{}` when it is falsy or anything failed
 */
const run = async (url, fn, injection = {}) => {
    let pageman;
    let result;
    try {
        pageman = await pageBroker.open();
        const { page } = pageman;
        await page.goto(url);
        result = await evaluate(`(${fn})(page)`, { page, echo: injection.echo });
    } catch (e) {
        console.error(e);
        result = {};
    }
    // Release only a page that was actually acquired. The release is not
    // awaited so the result is returned without delay, but a failure is
    // logged rather than surfacing as a thrown error or unhandled rejection.
    if (pageman) {
        Promise.resolve()
            .then(() => pageBroker.close({ page: pageman.page, id: pageman.id }))
            .catch(e => console.error(e));
    }
    return result || {};
};
60 |
61 | // const run = async (url, fn, injection = {}) => {
62 | // return browser.newPage().then(async page => {
63 | // await page.goto(url);
64 | // return evaluate(`(${fn})(page)`, { page, echo: injection.echo }).then(data => {
65 | // page.close();
66 | // return data;
67 | // }, () => {
68 | // page.close();
69 | // return {};
70 | // });
71 | // });
72 | // };
73 |
/**
 * Count the pages currently open in the browser.
 *
 * @returns {Promise<number>} number of pages, or -1 when there is no browser
 */
const pageCount = async () => {
    if (browser) {
        const openPages = await browser.pages();
        return openPages.length;
    }
    return -1;
};
79 |
80 | module.exports = {
81 | open,
82 | close,
83 | run,
84 | process: () => browser && browser.process(),
85 | pageCount
86 | };
87 |
--------------------------------------------------------------------------------
/src/scheduler.js:
--------------------------------------------------------------------------------
1 | const EventEmitter = require('events').EventEmitter;
2 |
/**
 * Create a broker that hands out browser pages, optionally from a reusable pool.
 *
 * With pooling enabled, at most `limit` pages exist. `open()` prefers an idle
 * page, creates a new one while the pool is below the limit, and otherwise
 * waits (first come, first served) until `close()` gives a page back.
 *
 * @param {Object}  browser object providing `newPage()`
 * @param {Object}  [options]
 * @param {boolean} [options.pooling=false] reuse pages instead of opening/closing one per request
 * @param {number}  [options.limit=100] pool size; non-numbers and values below 10 fall back to 100
 * @returns {{open: function(): Promise<{page: Object, id: (number|undefined)}>,
 *            close: function({page: Object, id: number}): Promise}}
 */
const PageBroker = function (browser, options = {}) {
    const FREE = 0;
    const BUSY = 1;

    const usePagePool = !!options.pooling;
    const defaultLimit = 100;
    const minLimit = 10;
    const limit = (typeof options.limit === 'number' && !isNaN(options.limit) && options.limit >= minLimit)
        ? options.limit
        : defaultLimit;

    const pool = []; // slots `{ page, status }`; a slot's id is its index
    const waiters = []; // resolvers of callers waiting for a slot, oldest first

    // Give a slot back: hand it straight to the oldest waiter (it stays BUSY),
    // or mark it idle when nobody is waiting.
    const release = id => {
        const slot = pool[id];
        if (!slot) return;
        const next = waiters.shift();
        if (next) {
            next(id);
        } else {
            slot.status = FREE;
        }
    };

    // Mark a slot as in use and make sure it holds a page.
    const occupy = async id => {
        const slot = pool[id];
        slot.status = BUSY;
        if (!slot.page) {
            try {
                slot.page = await browser.newPage();
            } catch (e) {
                release(id); // do not leak the slot when the page cannot be created
                throw e;
            }
        }
        return { page: slot.page, id };
    };

    return {
        open: async () => {
            if (!usePagePool) {
                return {
                    page: await browser.newPage()
                };
            }
            let id = pool.findIndex(slot => slot.status === FREE);
            if (id === -1) {
                if (pool.length < limit) {
                    // Reserve the slot before awaiting anything, so concurrent
                    // callers cannot grow the pool past the limit, and mark it
                    // BUSY so the new page is never handed out twice.
                    id = pool.push({ page: null, status: BUSY }) - 1;
                } else {
                    id = await new Promise(resolve => waiters.push(resolve));
                }
            }
            return occupy(id);
        },
        close({ page, id }) {
            if (!usePagePool) return page.close();
            return page.goto('about:blank').then(_ => {
                release(id);
                return page;
            }, err => {
                // The page could not be reset: drop it so the slot gets a fresh
                // page on next use, and free the slot instead of leaking it.
                const slot = pool[id];
                if (slot && slot.page === page) slot.page = null;
                Promise.resolve().then(() => page.close()).catch(() => {});
                release(id);
                throw err;
            });
        }
    };
};
52 |
53 | module.exports = {
54 | PageBroker
55 | };
56 |
--------------------------------------------------------------------------------
/src/table.js:
--------------------------------------------------------------------------------
1 | module.exports = function () {
2 | const ss = {};
3 | const get = key => {
4 | return ss[key];
5 | };
6 | const put = (key, s) => {
7 | if (ss.hasOwnProperty(key)) return false;
8 | return ss[key] = s;
9 | };
10 | const del = key => {
11 | return delete ss[key];
12 | };
13 | return { get, put, del };
14 | };
15 |
--------------------------------------------------------------------------------
/src/util.js:
--------------------------------------------------------------------------------
1 | const vm = require('vm');
2 |
3 | const fnsb = require('function-sandbox');
4 |
/**
 * Evaluate a string of JavaScript as an expression and return its value.
 *
 * @param {string} str source text; it is wrapped as `return (<str>)`
 * @param {Object} [injection] names made available to the evaluated code; its
 *        keys are passed to function-sandbox as the white list and its
 *        properties become globals of the new context
 * @returns {*} the value the expression evaluates to
 *
 * NOTE(review): this compiles and runs caller-supplied source through
 * `new Function` and `vm`. Node's documentation states that the vm module is
 * not a security mechanism, so the isolation presumably rests on
 * function-sandbox — confirm before exposing this to untrusted callers.
 */
const evaluate = (str, injection = {}) => {
    const safeFn = fnsb(new Function(`return (${str})`), {
        whiteList: Object.keys(injection)
    });
    const returnVarName = 'result';
    // Whatever function-sandbox returned is interpolated as source text; the
    // script stores its return value in a global of the new context, i.e. a
    // property of `sandbox`.
    const script = new vm.Script(`${returnVarName} = (${safeFn})()`);
    const sandbox = { ...injection };
    script.runInNewContext(sandbox);
    return sandbox[returnVarName];
};
15 |
/**
 * Depth-first traversal of nested plain objects and arrays.
 *
 * `fn(container, keyOrIndex, value)` is called for every own enumerable
 * property of a plain object and every element of an array, before the value
 * itself is descended into. Other values are not descended into.
 *
 * A container that is already being traversed further up the current path is
 * not entered again, so cyclic structures no longer recurse forever. A
 * container that is merely referenced from several places is still visited
 * once per reference, as before.
 *
 * @param {*} obj root value
 * @param {function(Object|Array, string|number, *)} fn visitor
 */
const walk = (obj, fn) => {
    const ancestors = new Set();

    const visit = node => {
        const tag = Object.prototype.toString.call(node);
        const isObject = tag === '[object Object]';
        if (!isObject && tag !== '[object Array]') return;
        if (ancestors.has(node)) return; // cycle: this container is an ancestor of itself

        ancestors.add(node);
        if (isObject) {
            Object.keys(node).forEach(key => {
                fn(node, key, node[key]);
                // Read again after the visitor ran: it may have replaced the value
                visit(node[key]);
            });
        } else {
            node.forEach((value, i) => {
                fn(node, i, value);
                visit(value);
            });
        }
        ancestors.delete(node);
    };

    visit(obj);
};
32 |
/**
 * IP Hash
 * https://github.com/indutny/sticky-session/blob/master/lib/sticky/master.js
 *
 * Hashes the elements of `ip` into an unsigned 32-bit integer.
 *
 * The seed is drawn once, when this module is loaded, so results are only
 * comparable within the same process.
 *
 * NOTE(review): presumably `ip` is an array-like of numbers (e.g. the parts of
 * an address) as in the linked source — confirm against callers. With a
 * string, `hash += num` would concatenate instead of add.
 *
 * @param {ArrayLike<number>} ip values to hash (indexed 0..length-1)
 * @returns {number} non-negative 32-bit integer
 */
const ipHash = (function () {
    // Random 32-bit integer, fixed for the lifetime of the process
    const seed = (Math.random() * 0xffffffff) | 0;

    return ip => {
        let hash = seed;

        for (let i = 0; i < ip.length; i++) {
            const num = ip[i];

            // Mix in one element; `% 2147483648` (2^31) bounds the magnitude
            // after each addition
            hash += num;
            hash %= 2147483648;
            hash += hash << 10;
            hash %= 2147483648;
            hash ^= hash >> 6;
        }

        // Final mixing rounds
        hash += hash << 3;
        hash %= 2147483648;
        hash ^= hash >> 11;
        hash += hash << 15;
        hash %= 2147483648;

        // Reinterpret as unsigned so the result is never negative
        return hash >>> 0;
    };
})();
62 |
/**
 * Random number between `low` and `high`, scaled from `Math.random()`.
 *
 * @param {number} low
 * @param {number} high
 * @returns {number}
 */
const randIn = (low, high) => {
    const span = high - low;
    return low + span * Math.random();
};

/**
 * Promise that resolves to 1 after a delay.
 *
 * @param {number} d delay in milliseconds (lower bound when `high` is given)
 * @param {number} [high] when truthy, the delay is picked at random between `d` and `high`
 * @returns {Promise<number>}
 */
const wait = (d, high) => {
    const delay = high ? randIn(d, high) : d;
    return new Promise(resolve => {
        setTimeout(() => resolve(1), delay);
    });
};
71 |
72 | module.exports = {
73 | evaluate,
74 | walk,
75 | wait,
76 | ipHash,
77 | randIn
78 | };
79 |
--------------------------------------------------------------------------------
/test/163/account.js:
--------------------------------------------------------------------------------
1 | module.exports = require('../accounts').accountToSignin163();
2 |
--------------------------------------------------------------------------------
/test/163/clickLinkInEmail.js:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenfe/puppeteer-service/7b1731d1725d5376e23109f6054ac963278eec75/test/163/clickLinkInEmail.js
--------------------------------------------------------------------------------
/test/163/openEmail.js:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenfe/puppeteer-service/7b1731d1725d5376e23109f6054ac963278eec75/test/163/openEmail.js
--------------------------------------------------------------------------------
/test/163/signin.js:
--------------------------------------------------------------------------------
/**
 * Page runner for the 163 mail test: signs in when the page is not yet on an
 * `https://ssl.mail.` URL, then returns the text of the last `.mboxlst`
 * element.
 *
 * `wait`, `username` and `password` are not defined in this file; the test
 * script supplies them through the request's `injection` option.
 *
 * NOTE(review): the login form is looked up in `frames[3]`, a hard-coded frame
 * index — presumably the `#x-URS-iframe` login frame; confirm it still
 * matches the page.
 * NOTE(review): `username` and `password` are spliced into script source
 * inside single quotes, so a value containing `'` would break the statement.
 * The trailing `7` passed to `page.evaluate` is not used by the callback.
 *
 * @param {Object} page puppeteer page already navigated to the entry URL
 * @returns {Promise<{data: string}>}
 */
module.exports = async page => {
    await wait(1000);

    if (!page.url().startsWith('https://ssl.mail.')) {
        await page.waitForSelector('#x-URS-iframe');
        let frames = await page.frames();
        const frame = frames[3];
        const frameContext = await frame.executionContext();

        const usernameSelector = '#login-form > div:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(2) > input:nth-of-type(1)';
        const passwordSelector = '#login-form > div:nth-of-type(1) > div:nth-of-type(3) > div:nth-of-type(2) > input:nth-of-type(2)';

        await frameContext.evaluate(`document.querySelector('${usernameSelector}').value = '${username}'`);
        await frameContext.evaluate(`document.querySelector('${passwordSelector}').value = '${password}'`);
        await frameContext.evaluate(`document.querySelector('#dologin').click()`);
        await page.waitForNavigation();
    }

    await page.waitForSelector('.mboxlst');
    const result = await page.evaluate(_ => {
        let q = document.querySelectorAll('.mboxlst');
        return q[q.length - 1].innerText;
    }, 7);
    return {
        data: result
    };
};
28 |
--------------------------------------------------------------------------------
/test/163/test.js:
--------------------------------------------------------------------------------
1 | const Run = require('puppeteer-service-client');
2 |
3 | const account = require('./account');
4 |
5 | const runner = require('./signin');
6 |
7 | const { wait } = require('../../src/util');
8 |
9 | const { port, apiName } = require('../../src/config').server;
10 |
// Send the 163 sign-in runner to the locally running service and print the outcome.
const endpoint = `http://127.0.0.1:${port}/${apiName}`;

const job = {
    url: 'https://mail.163.com/',
    run: runner,
    options: {
        // Names the runner refers to without defining them itself
        injection: { wait, ...account }
    }
};

Run(endpoint, job)
    .then(data => console.log(JSON.stringify(data)))
    .catch(err => console.error(err));
23 |
--------------------------------------------------------------------------------
/test/accounts/index.js:
--------------------------------------------------------------------------------
1 | const fs = require('fs');
2 | const path = require('path');
3 |
4 | const accountList = require('./accounts.js');
5 |
/**
 * Pick the first listed account on the 163.com domain that has a username.
 *
 * @returns {{username: string, password: string}}
 * @throws {Error} when the account list contains no such account
 */
const accountToSignin163 = () => {
    const account = accountList.find(item => (item.username && item.domain === '163.com'));
    if (!account) {
        // Previously this surfaced as an opaque destructuring TypeError
        throw new Error('No account with a username and the domain 163.com found in accounts.js');
    }
    const { username, password } = account;
    return {
        username,
        password
    };
};
13 |
/**
 * Find the first listed account whose stored record has a `github` entry.
 *
 * @returns {*} that `github` entry, or `undefined` when no record has one
 */
const accountToSigninGithub = () => {
    for (const entry of accountList) {
        const email = `${entry.username}@${entry.domain}`;
        const record = readAccount(email);
        if (record.github) {
            return record.github;
        }
    }
};
20 |
/**
 * Finds the first account that has no `github` field, neither in accountList
 * nor in its stored record, and builds the sign-up payload for it.
 * The proposed username is the first run of ASCII letters in the mail username.
 * Returns undefined when every account already has a `github` field.
 */
const accountToSignupGithub = () => {
    const leadingLetters = /([a-zA-Z]+)[0-9]*/;
    for (const candidate of accountList) {
        if (candidate.github) continue;

        const email = `${candidate.username}@${candidate.domain}`;
        const stored = readAccount(email);
        if (stored.github) continue;

        const username = leadingLetters.exec(candidate.username)[1];
        return {
            username,
            password: candidate.password,
            email,
            // Stored record fields win over the list entry on key collisions.
            origin: Object.assign({}, candidate, stored)
        };
    }
};
37 |
/**
 * Reads a JSON file and returns its content as a plain object.
 *
 * Returns an empty object when the file does not exist, when it is not valid
 * JSON, or when the parsed value is not a plain object (e.g. `null`, a number,
 * a string or an array). Callers read properties off the result and merge into
 * it with Object.assign, so anything other than an object would break them.
 *
 * @param {string} filepath - path of the JSON file to read
 * @returns {Object} the parsed object, or `{}` as described above
 */
const readFile = filepath => {
    if (!fs.existsSync(filepath)) return {};
    const text = fs.readFileSync(filepath, 'utf8');

    let parsed;
    try {
        parsed = JSON.parse(text);
    } catch (e) {
        // Malformed content is treated the same as a missing file.
        return {};
    }

    const isPlainRecord = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
    return isPlainRecord ? parsed : {};
};
48 |
/**
 * Serialises `obj` as JSON with 4-space indentation and writes it to `filepath` as UTF-8.
 * @param {string} filepath - destination file
 * @param {Object} [obj={}] - value to serialise
 */
const writeFile = (filepath, obj = {}) => {
    const serialized = JSON.stringify(obj, null, 4);
    fs.writeFileSync(filepath, serialized, 'utf8');
};
52 |
/**
 * Loads the stored record for an e-mail address from `./accounts/<email>.json`
 * (resolved relative to this module's directory) via readFile.
 * @param {string} email
 */
const readAccount = email => readFile(path.resolve(__dirname, `./accounts/${email}.json`));
57 |
/**
 * Merges `obj` into the stored record for `obj.email` and writes it back.
 * Returns false without touching the disk when `obj.email` is falsy;
 * otherwise returns undefined.
 * @param {Object} [obj={}] - fields to record; must carry an `email`
 */
const recordAccount = (obj = {}) => {
    const { email } = obj;
    if (!email) return false;

    const filepath = path.resolve(__dirname, `./accounts/${email}.json`);
    // Fields of `obj` overwrite whatever the stored record already holds.
    const merged = Object.assign(readFile(filepath), obj);
    writeFile(filepath, merged);
};
64 |
65 | module.exports = {
66 | accountToSignin163,
67 | accountToSigninGithub,
68 | accountToSignupGithub,
69 | recordAccount,
70 | githubRepos: require('./accounts.starRepos')
71 | };
72 |
--------------------------------------------------------------------------------
/test/batch.js:
--------------------------------------------------------------------------------
// Command-line options:
//   -u / --url <url>       endpoint to call (a default is applied later when omitted)
//   -n / --number <count>  number of jobs to start (defaults to 30)
let batchNumber = 30;
let apiUrl;
const args = process.argv.slice(2);
for (let index = 0; index < args.length; index++) {
    const flag = args[index];
    // The value of an option is simply whatever follows it on the command line.
    const value = args[index + 1];
    if (flag === '-u' || flag === '--url') {
        apiUrl = value;
    } else if (flag === '-n' || flag === '--number') {
        batchNumber = +value;
    }
}
15 |
16 | const Run = require('puppeteer-service-client');
17 |
18 | const { wait } = require('../src/util');
19 |
20 | const { port, apiName } = require('../src/config').server;
// Fall back to the locally configured endpoint when no URL was given on the command line.
apiUrl = apiUrl || `http://127.0.0.1:${port}/${apiName}`;

const batchPromises = [];

const TOTAL_TIMER = 'time consumed: ';
console.time(TOTAL_TIMER);

// Jobs cycle through these pages in order.
const testUrls = [
    'https://www.sogou.com/',
    'https://www.baidu.com/',
    'https://www.youdao.com/',
    'https://www.bing.com/'
];

// Starts job number `i`, timing it individually; failures are logged, not rethrown.
const startJob = i => {
    const timerLabel = `time ${i} consumed: `;
    console.time(timerLabel);
    return Run(`${apiUrl}?q=${i}`, {
        url: testUrls[i % testUrls.length],
        // NOTE(review): `echo` is not defined in this file; presumably it is
        // available wherever Run executes this function — confirm.
        run: async page => {
            console.log('page ready');
            echo(`${i} hey ` + page.url());
            const title = await page.title();
            return { title };
        },
        options: {
            injection: { i, wait }
        }
    })
        .then(data => {
            console.timeEnd(timerLabel);
        })
        .catch(err => console.error(err));
};

for (let i = 0; i < batchNumber; i++) {
    batchPromises.push(startJob(i));
}

Promise.all(batchPromises).then(() => {
    console.timeEnd(TOTAL_TIMER);
});
69 |
--------------------------------------------------------------------------------
/test/github/account-signin.js:
--------------------------------------------------------------------------------
1 | module.exports = require('../accounts').accountToSigninGithub();
2 |
--------------------------------------------------------------------------------
/test/github/account-signup.js:
--------------------------------------------------------------------------------
1 | module.exports = require('../accounts').accountToSignupGithub();
2 |
--------------------------------------------------------------------------------
/test/github/openAndStarRepos.js:
--------------------------------------------------------------------------------
1 | const repoList = require('../accounts').githubRepos;
2 |
3 | const { randIn } = require('../../src/util');
4 |
5 | module.exports = async (page, repos = repoList) => {
6 | if (!Array.isArray(repos)) repos = [repos];
7 | for (let repo of repos) {
8 | if (Math.random() < 0.2) continue;
9 | await page.goto(`https://github.com/${repo}`);
10 | await page.waitFor(randIn(2000, 10000));
11 | const starButton = 'button[aria-label="Star this repository"]';
12 | await page.click(starButton);
13 | console.log(repo, 'star clicked');
14 | await page.waitFor(randIn(2000, 10000));
15 | }
16 |
17 | return page;
18 | };
19 |
--------------------------------------------------------------------------------
/test/github/signin.js:
--------------------------------------------------------------------------------
1 | const { wait } = require('../../src/util');
2 |
3 | module.exports = async ({ username, password }, browser) => {
4 | const page = await browser.newPage();
5 | await page.goto('https://github.com/login');
6 |
7 | await wait(1000);
8 |
9 | const usernameSelector = 'input#login_field';
10 | const passwordSelector = 'input#password';
11 |
12 | await page.type(usernameSelector, username);
13 | await page.type(passwordSelector, password);
14 | await page.click('#login form > div:last-child input:last-child');
15 |
16 | await page.waitForNavigation();
17 | const result = await page.$eval('ul.mini-repo-list', e => e.innerText);
18 | console.log(result);
19 |
20 | return page;
21 | };
22 |
--------------------------------------------------------------------------------
/test/github/signup.js:
--------------------------------------------------------------------------------
1 | const { wait } = require('../../src/util');
2 |
/**
 * Produces a random one-letter salt together with a function that appends a
 * salt to a base name. Both parameters are accepted but not used.
 * @returns {{username: function(*, *): string, salt: string}}
 */
const changeUsername = (username, salt) => {
    const alphabet = 'abcdefghijklmnopqrstuvwxyz';
    const letter = alphabet[Math.ceil(Math.random() * 10000) % 26];
    const appendSalt = (base, suffix) => ('' + base + suffix);
    return {
        username: appendSalt,
        salt: letter
    };
};
10 |
/**
 * Produces a random two-character salt (one lowercase letter followed by one
 * digit) together with a function that appends a salt to a base password.
 * Both parameters are accepted but not used.
 * @returns {{password: function(*, *): string, salt: string}}
 */
const changePassword = (password, salt) => {
    const alphabet = 'abcdefghijklmnopqrstuvwxyz';
    const letter = alphabet[Math.ceil(Math.random() * 10000) % 26];
    const digit = Math.ceil(Math.random() * 10000) % 10;
    const appendSalt = (base, suffix) => ('' + base + suffix);
    return {
        password: appendSalt,
        salt: `${letter}${digit}`
    };
};
20 |
/**
 * Replaces the content of an input: empties its value in the page, focuses it,
 * then types `value` through the page keyboard.
 */
const setInput = async (page, selector, value) => {
    const clearValue = input => { input.value = '' };
    await page.$eval(selector, clearValue);
    await page.focus(selector);
    await page.keyboard.type(value);
};
26 |
27 | module.exports = async ({ username, password, email }, browser) => {
28 | const page = await browser.newPage();
29 |
30 | const pageUrl = 'https://github.com/join';
31 | await page.goto(pageUrl);
32 |
33 | await wait(1000);
34 |
35 | const usernameSelector = 'input#user_login';
36 | const emailSelector = 'input#user_email';
37 | const passwordSelector = 'input#user_password';
38 | const originUsername = username;
39 | const originPassword = password;
40 | let usernameSalt;
41 | let passwordSalt;
42 | while (true) {
43 | await setInput(page, usernameSelector, username);
44 | await setInput(page, emailSelector, email);
45 | await setInput(page, passwordSelector, password);
46 | await page.click('button#signup_button');
47 | await page.waitForNavigation();
48 | if (page.url() !== pageUrl) break;
49 |
50 | let usernameUpdate = changeUsername(originUsername, usernameSalt);
51 | usernameSalt = usernameUpdate.salt;
52 | username = usernameUpdate.username(originUsername, usernameSalt);
53 | console.log('change username to: ', username);
54 |
55 | let passwordUpdate = changePassword(originPassword, passwordSalt);
56 | passwordSalt = passwordUpdate.salt;
57 | password = passwordUpdate.password(originPassword, passwordSalt);
58 | console.log('change password to: ', password);
59 | }
60 |
61 | await page.click('button.js-choose-plan-submit');
62 | await page.waitForNavigation();
63 |
64 | const input1 = 'form.setup-form > fieldset:nth-of-type(1) input';
65 | const input2 = 'form.setup-form > fieldset:nth-of-type(2) input';
66 | const input3 = 'form.setup-form > fieldset:nth-of-type(3) input';
67 | const input4 = 'form.setup-form > fieldset:nth-of-type(4) input[type="text"]';
68 | await page.click(input1);
69 | await page.click(input2);
70 | await page.click(input3);
71 | await page.type(input4, 'web-development machine-learning ');
72 | await page.click('form.setup-form > input.btn-primary[name="commit"]');
73 |
74 | await page.waitForNavigation();
75 | if (page.url() === 'https://github.com/dashboard') {
76 | console.log('success: ', {
77 | username,
78 | password,
79 | email
80 | });
81 | }
82 |
83 | return {
84 | page,
85 | github: {
86 | username,
87 | password
88 | },
89 | email
90 | };
91 | };
92 |
--------------------------------------------------------------------------------
/test/github/test-signin-repl.js:
--------------------------------------------------------------------------------
1 | const puppeteer = require('puppeteer');
2 | const pupConf = require('../../src/config').launch;
3 |
4 | const readlineSync = require('readline-sync');
5 | const username = readlineSync.question('username: ');
6 | const password = readlineSync.question('password: ', {
7 | hideEchoBack: true
8 | });
9 |
10 | if (readlineSync.question('set the executable path of chromium? [yn]: ') === 'y') {
11 | pupConf.executablePath = '..\\spiderman\\node_modules\\puppeteer\\.local-chromium\\win64-526987\\chrome-win32\\chrome.exe';
12 | }
13 |
14 | const account = { username, password };
15 |
16 | const runner = require('./signin');
17 |
// Launch the browser, run the sign-in flow with the prompted credentials, and
// close the browser afterwards even when the runner throws.
puppeteer.launch(pupConf).then(async browser => {
    try {
        await runner(account, browser);
    } finally {
        await browser.close();
    }
}).catch(err => {
    // Report the failure instead of leaving an unhandled rejection.
    console.error(err);
    process.exitCode = 1;
});
22 |
--------------------------------------------------------------------------------
/test/github/test-signin.js:
--------------------------------------------------------------------------------
1 | const puppeteer = require('puppeteer');
2 | const pupConf = require('../../src/config').launch;
3 |
4 | const account = require('./account-signin');
5 |
6 | const runner = require('./signin');
7 |
// Launch the browser, run the sign-in flow with the stored account, and close
// the browser afterwards even when the runner throws.
puppeteer.launch(pupConf).then(async browser => {
    try {
        await runner(account, browser);
    } finally {
        await browser.close();
    }
}).catch(err => {
    // Report the failure instead of leaving an unhandled rejection.
    console.error(err);
    process.exitCode = 1;
});
12 |
--------------------------------------------------------------------------------
/test/github/test-signup-batch.js:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenfe/puppeteer-service/7b1731d1725d5376e23109f6054ac963278eec75/test/github/test-signup-batch.js
--------------------------------------------------------------------------------
/test/github/test-signup.js:
--------------------------------------------------------------------------------
1 | const puppeteer = require('puppeteer');
2 | const pupConf = require('../../src/config').launch;
3 |
4 | pupConf.executablePath = '..\\spiderman\\node_modules\\puppeteer\\.local-chromium\\win64-526987\\chrome-win32\\chrome.exe';
5 | pupConf.headless = false;
6 |
7 | const account = require('./account-signup');
8 |
9 | const runner = require('./signup');
10 | const batchStar = require('./openAndStarRepos');
11 |
12 | const { recordAccount } = require('../accounts');
13 |
// Launch the browser, sign up, record the resulting account, star the
// repositories, and close the browser afterwards even when a step throws.
puppeteer.launch(pupConf).then(async browser => {
    try {
        const { page, github, email } = await runner(account, browser);
        recordAccount({ github, email, ...account.origin });
        await batchStar(page);
    } finally {
        await browser.close();
    }
}).catch(err => {
    // Report the failure instead of leaving an unhandled rejection.
    console.error(err);
    process.exitCode = 1;
});
20 |
--------------------------------------------------------------------------------
/test/iqiyi/screenshot.js:
--------------------------------------------------------------------------------
1 | const { wait } = require('../../src/util');
2 |
3 | const path = require('path');
4 |
/**
 * Measures the scrollbar width inside the page: a hidden 100px-wide probe is
 * attached to the body, scrollbars are forced on it, and the width left for a
 * 100%-wide child is subtracted from the probe's width before scrollbars.
 * The function only touches `document`, so it can be serialised into a page.
 * @returns {number} width difference in pixels
 */
function getScrollbarWidth() {
    const probe = document.createElement('div');
    Object.assign(probe.style, { visibility: 'hidden', width: '100px' });
    document.body.appendChild(probe);

    // Must be read before scrollbars are forced.
    const plainWidth = probe.offsetWidth;
    probe.style.overflow = 'scroll';

    const content = document.createElement('div');
    content.style.width = '100%';
    probe.appendChild(content);
    const scrolledWidth = content.offsetWidth;

    // Clean up the probe (and its child) again.
    probe.parentNode.removeChild(probe);

    return plainWidth - scrolledWidth;
}
28 |
29 | module.exports = async (url, browser) => {
30 | const page = await browser.newPage();
31 |
32 | await page.setViewport({
33 | width: 1920,
34 | height: 1080
35 | });
36 | await page.goto(url);
37 |
38 | await wait(1000);
39 |
40 | while (true) {
41 | let re = await page.evaluate(`(_ => {
42 | let getScrollbarWidth = ${getScrollbarWidth};
43 | window.scrollBy(0, window.innerHeight);
44 | let scrbWidth = getScrollbarWidth();
45 | return document.body.scrollHeight + scrbWidth === window.innerHeight + window.scrollY;
46 | })()`);
47 | await page.waitFor(1000);
48 | if (re) break;
49 | }
50 |
51 | const filepath = path.resolve(__dirname, `./screenshots/${Date.now()}.png`);
52 | await page.screenshot({
53 | path: filepath,
54 | fullPage: true
55 | });
56 |
57 | const result = {
58 | path: filepath
59 | };
60 | console.log(result);
61 |
62 | return result;
63 | };
64 |
--------------------------------------------------------------------------------
/test/iqiyi/screenshots/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenfe/puppeteer-service/7b1731d1725d5376e23109f6054ac963278eec75/test/iqiyi/screenshots/.gitkeep
--------------------------------------------------------------------------------
/test/iqiyi/test-screenshot.js:
--------------------------------------------------------------------------------
1 | const url = 'http://www.iqiyi.com/';
2 |
3 | const puppeteer = require('puppeteer');
4 | const pupConf = require('../../src/config').launch;
5 |
6 | const readlineSync = require('readline-sync');
7 | if (readlineSync.question('set the executable path of chromium? [yn]: ') === 'y') {
8 | pupConf.executablePath = '..\\spiderman\\node_modules\\puppeteer\\.local-chromium\\win64-526987\\chrome-win32\\chrome.exe';
9 | }
10 |
11 | const runner = require('./screenshot');
12 |
13 | const open = require('open');
14 |
// Launch the browser, take the screenshot, close the browser (even when the
// runner throws), then open the written image.
puppeteer.launch(pupConf).then(async browser => {
    let result;
    try {
        result = await runner(url, browser);
    } finally {
        await browser.close();
    }
    open(result.path);
}).catch(err => {
    // Report the failure instead of leaving an unhandled rejection.
    console.error(err);
    process.exitCode = 1;
});
20 |
--------------------------------------------------------------------------------
/test/zhihu/elem-extract.js:
--------------------------------------------------------------------------------
1 | const fs = require('fs');
2 | const path = require('path');
3 |
4 | module.exports = async ({
5 | username,
6 | password,
7 | url
8 | }, browser) => {
9 | const page = await browser.newPage();
10 |
11 | await page.setViewport({
12 | width: 1280,
13 | height: 720
14 | });
15 | await page.goto(url);
16 |
17 | await page.waitFor(1000);
18 |
19 | await page.waitForSelector('.SignContainer-switch');
20 | await page.click('.SignContainer-switch>span[data-reactid]');
21 | await page.waitForSelector('.Button.Login-socialButtonEntrance');
22 | await page.click('.Button.Login-socialButtonEntrance');
23 | await page.$$eval('.Button.Login-socialButton', btns => btns[1].click());
24 | await page.waitForNavigation();
25 |
26 | await page.waitFor(1000);
27 | await page.type(`#userId`, username);
28 | await page.type(`#passwd`, password);
29 | await page.click('.formbtn_01');
30 | await page.click('.formbtn_01');
31 |
32 | await page.waitForNavigation();
33 | await page.click('#email');
34 | await page.click('.WB_btn_allow');
35 |
36 | await page.waitForNavigation();
37 |
38 | await page.waitFor(1000);
39 |
40 | // await page.addScriptTag({
41 | // url: 'https://raw.githubusercontent.com/shenfe/FeSpider/master/src/fespider/FeSpider.js'
42 | // });
43 |
44 | const result = await page.evaluate((x, js) => {
45 | (new Function(js))();
46 | fespider.present(document.querySelector(x));
47 | return document.documentElement.outerHTML;
48 | }, '.TopstoryItem:nth-child(1)', fs.readFileSync(path.resolve(__dirname, './fespider.js'), 'utf8'));
49 |
50 | await page.waitFor(1000);
51 |
52 | // console.log(result);
53 |
54 | return result;
55 | };
56 |
--------------------------------------------------------------------------------
/test/zhihu/fespider.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @Author: Ke Shen
3 | * @Date: 2017-03-10 09:43:57
4 | * @Email: keshen@sohu-inc.com
5 | * @Last modified by: godzilla
6 | * @Last modified time: 2017-03-10 09:43:57
7 | */
8 |
9 | (function () {
10 |
11 | if (window.fespider) return;
12 |
13 | var conf = {
14 | classNameUpperCase: false,
15 | classNameModulePrefix: true,
16 | moduleName: 'module',
17 | recoverUrlInAttr: false,
18 | fetchFont: true,
19 | serverHost: 'https://127.0.0.1:3663',
20 | pullContent: true,
21 | generateType: 'html' // 'html' | 'vue'
22 | };
23 |
24 | /**
25 | * String Hash
26 | * Ref: http://werxltd.com/wp/2010/05/13/javascript-implementation-of-javas-string-hashcode-method/
27 | */
// Install String.prototype.hashCode unless something already defined it.
// Same arithmetic as Java's String.hashCode: hash = hash * 31 + charCode,
// truncated to a signed 32-bit integer after every character.
if (!String.prototype.hashCode) {
    String.prototype.hashCode = function () {
        var hash = 0;
        for (var pos = 0, len = this.length; pos < len; pos++) {
            // (hash << 5) - hash is hash * 31; `| 0` keeps the value in 32 bits.
            hash = (((hash << 5) - hash) + this.charCodeAt(pos)) | 0;
        }
        return hash;
    };
}
40 |
// Fallback for environments without String.prototype.endsWith:
// true only when `s` is a string no longer than this one and equal to its tail.
if (!String.prototype.endsWith) {
    String.prototype.endsWith = function (s) {
        var comparable = typeof s === 'string' && s.length <= this.length;
        return comparable && this.substr(this.length - s.length) === s;
    };
}
48 |
/**
 * Splits a URL into its parts by assigning it to the `href` of a detached
 * <a> element and reading the parts back from that element.
 * @param {string} url
 * @returns {{protocol, host, path, search, hash}} `path` is the element's pathname
 */
var parseUrl = function (url) {
    var anchor = document.createElement('a');
    anchor.href = url;
    var { protocol, host, pathname, search, hash } = anchor;
    return { protocol, host, path: pathname, search, hash };
};
/**
 * Resolves `target` against `base` into an absolute URL.
 * A target whose text before the first ':' contains a lowercase letter is
 * treated as already absolute and returned unchanged. Otherwise `base` is
 * first resolved against the current page URL, and the target is attached
 * protocol-relative ('//...'), host-relative ('/...') or directory-relative.
 * @param {string} base - URL the target was found in
 * @param {string} target - URL to resolve
 * @returns {string}
 */
var recoverUrl = function (base, target) {
    var colonAt = target.indexOf(':');
    var scheme = colonAt > 0 ? target.substr(0, colonAt) : '';
    if (scheme && /[a-z]+/.test(scheme)) {
        return target;
    }

    var b = parseUrl(recoverUrl(window.location.href, base));
    if (target.startsWith('//')) return b.protocol + target;

    var origin = b.protocol + '//' + b.host;
    if (target.startsWith('/')) return origin + target;

    // Directory of the base document, always ending with a slash.
    var directory = b.path.endsWith('/')
        ? b.path
        : b.path.substring(0, b.path.lastIndexOf('/')) + '/';
    return origin + directory + target;
};
/**
 * Rewrites every `url(...)` occurrence in a piece of CSS text.
 * Surrounding quotes are stripped from the argument; `data:` URLs are emitted
 * unquoted and unchanged, every other URL is resolved against `baseUrl` with
 * recoverUrl and emitted in single quotes.
 * @param {string} cssText
 * @param {string} baseUrl - URL of the stylesheet the text came from
 * @returns {string} the rewritten CSS text
 */
var recoverCssUrls = function (cssText, baseUrl) {
    var stripQuotes = function (raw) {
        var first = raw.charAt(0);
        var last = raw.charAt(raw.length - 1);
        var quoted = (first === "'" && last === "'") || (first === '"' && last === '"');
        return quoted ? raw.substr(1, raw.length - 2) : raw;
    };

    return cssText.replace(/url\s*\((.*?)\)/g, function (match, argument) {
        var inner = stripQuotes(argument.trim());
        if (inner.startsWith('data:')) return 'url(' + inner + ')';
        return 'url(\'' + recoverUrl(baseUrl, inner) + '\')';
    });
};
85 |
/**
 * Collects the `href` of every stylesheet in document.styleSheets that has
 * one, walking the list from the last sheet to the first.
 * @returns {string[]}
 */
var getCssLinks = function () {
    var sheets = document.styleSheets;
    var links = [];
    for (var idx = sheets.length - 1; idx >= 0; idx--) {
        var href = sheets[idx].href;
        if (href) links.push(href);
    }
    return links;
};
/**
 * Gathers @font-face rules of the page.
 * Rules from sheets without an href are read directly (CSSFontFaceRule entries,
 * URLs resolved against the page URL). Sheets with an href are requested
 * through `conf.serverHost + '/get/' + <encoded url>` and scanned with a regular
 * expression.
 *
 * @returns {Promise<Array>} one entry per linked sheet: the directly-read rules
 *          followed by the rules found in that sheet, or undefined when its
 *          request failed (the error is logged).
 */
var getFontFaces = function () {
    var sheets = document.styleSheets;
    var urlQueue = [];
    var interRules = [];

    for (var s = sheets.length - 1; s >= 0; s--) {
        var sheet = sheets[s];
        if (sheet.href) {
            urlQueue.push(sheet.href);
            continue;
        }
        var rules = sheet.rules || sheet.cssRules || [];
        for (var r = rules.length - 1; r >= 0; r--) {
            if (rules[r].constructor.name !== 'CSSFontFaceRule') continue;
            interRules.push(recoverCssUrls(rules[r].cssText, window.location.href));
        }
    }

    var fetchFontFaces = function (url) {
        var request = fetch(conf.serverHost + '/get/' + encodeURIComponent(url), {
            mode: 'cors',
            headers: {'Content-Type': 'text/plain'}
        });
        return request
            .then(function (res) {
                return res.text();
            })
            .then(function (data) {
                var found = data.match(/@font-face\s*\{[^}]+}/g) || [];
                var resolved = found.map(function (block) {
                    return recoverCssUrls(block, url);
                });
                return interRules.concat(resolved);
            })
            .catch(function (err) {
                console.error(err);
            });
    };

    return Promise.all(urlQueue.map(function (url) {
        return fetchFontFaces(url);
    }));
};
131 |
132 | const ignoreNodeName = {
133 | '#text': true,
134 | '#comment': true,
135 | 'meta': true,
136 | 'script': true,
137 | 'style': true,
138 | 'iframe': true
139 | };
140 |
141 | const PropertyTable = {
142 | 'display': {},
143 | 'zoom': {},
144 | 'flex-direction': {},
145 | 'flex-wrap': {},
146 | 'flex-flow': {},
147 | 'justify-content': {},
148 | 'align-items': {},
149 | 'align-content': {},
150 | 'order': {},
151 | 'flex-grow': {},
152 | 'flex-shrink': {},
153 | 'flex-basis': {},
154 | 'flex': {},
155 | 'align-self': {},
156 | 'position': {},
157 | 'z-index': {},
158 | 'width': {
159 | default: () => 'auto'
160 | },
161 | 'height': {
162 | default: () => 'auto'
163 | },
164 | 'max-width': {
165 | ignore: function (v) {
166 | return v === 'auto' || v === 'none';
167 | }
168 | },
169 | 'min-width': {
170 | ignore: function (v) {
171 | return v === 'auto' || v === 'none';
172 | }
173 | },
174 | 'max-height': {
175 | ignore: function (v) {
176 | return v === 'auto' || v === 'none';
177 | }
178 | },
179 | 'min-height': {
180 | ignore: function (v) {
181 | return v === 'auto' || v === 'none';
182 | }
183 | },
184 | 'top': {
185 | default: () => 'auto'
186 | },
187 | 'right': {
188 | default: () => 'auto'
189 | },
190 | 'bottom': {
191 | default: () => 'auto'
192 | },
193 | 'left': {
194 | default: () => 'auto'
195 | },
196 | 'background': {},
197 | // 'background-color': {},
198 | // 'background-size': {},
199 | 'margin': {
200 | default: (type) => {
201 | var ignore = ['ul', 'p', 'dd', 'h1', 'h2', 'h3', 'h4', 'body'];
202 | if (ignore.indexOf(type) >= 0) return false;
203 | return '0px';
204 | }
205 | },
206 | // 'margin-top': {},
207 | // 'margin-right': {},
208 | // 'margin-bottom': {},
209 | // 'margin-left': {},
210 | 'padding': {},
211 | // 'padding-top': {},
212 | // 'padding-right': {},
213 | // 'padding-bottom': {},
214 | // 'padding-left': {},
215 | 'border': {
216 | ignore: function (v) {
217 | return v.indexOf('none') >= 0;
218 | }
219 | },
220 | 'border-top': {
221 | ignore: function (v) {
222 | return v.indexOf('none') >= 0;
223 | }
224 | },
225 | 'border-right': {
226 | ignore: function (v) {
227 | return v.indexOf('none') >= 0;
228 | }
229 | },
230 | 'border-bottom': {
231 | ignore: function (v) {
232 | return v.indexOf('none') >= 0;
233 | }
234 | },
235 | 'border-left': {
236 | ignore: function (v) {
237 | return v.indexOf('none') >= 0;
238 | }
239 | },
240 | 'border-radius': {},
241 | 'border-collapse': {
242 | inherit: true
243 | },
244 | 'border-spacing': {
245 | inherit: true
246 | },
247 | 'box-shadow': {},
248 | 'box-sizing': {},
249 | 'outline': {
250 | ignore: function (v) {
251 | return v.indexOf('none') >= 0;
252 | }
253 | },
254 | 'color': {
255 | inherit: true
256 | },
257 | 'text-align': {
258 | inherit: true
259 | },
260 | 'text-indent': {
261 | inherit: true
262 | },
263 | 'text-overflow': {
264 | default: () => 'clip'
265 | },
266 | 'overflow-x': {},
267 | 'overflow-y': {},
268 | 'cursor': {
269 | inherit: true
270 | },
271 | 'float': {},
272 | 'clear': {},
273 | 'table-layout': {},
274 | 'font': {
275 | inherit: true
276 | },
277 | /*
278 | 'font-family': {
279 | inherit: true
280 | },
281 | 'font-size': {
282 | inherit: true
283 | },
284 | 'font-weight': {
285 | inherit: true
286 | },
287 | 'font-style': {
288 | inherit: true
289 | },
290 | 'line-height': {
291 | inherit: true
292 | },
293 | */
294 | 'letter-spacing': {
295 | inherit: true
296 | },
297 | 'list-style': {
298 | inherit: true
299 | },
300 | 'opacity': {},
301 | 'visibility': {
302 | inherit: true
303 | },
304 | 'text-decoration': {},
305 | 'vertical-align': {},
306 | 'white-space': {
307 | inherit: true
308 | },
309 | 'word-break': {
310 | inherit: true
311 | },
312 | 'word-wrap': {
313 | inherit: true
314 | },
315 | 'content': {},
316 | 'transform': {},
317 | 'transform-origin': {
318 | default: () => '50% 50%'
319 | },
320 | 'transition': {},
321 | 'fill': {}
322 | };
323 |
/**
 * Removes, in place, each per-side border entry ('border-top', 'border-right',
 * 'border-bottom', 'border-left') whose value is identical to the 'border' entry.
 * @param {Object} cs - style map keyed by CSS property name
 */
var cleanComputedStyle = function (cs) {
    ['top', 'right', 'bottom', 'left'].forEach(function (side) {
        var key = 'border-' + side;
        if (cs[key] === cs['border']) delete cs[key];
    });
};
330 |
/**
 * Converts a hyphenated CSS property name to camelCase,
 * e.g. 'border-top-left-radius' -> 'borderTopLeftRadius'.
 * The first segment is kept as is; every following segment gets its first
 * character upper-cased.
 * @param {string} name - hyphenated property name
 * @returns {string}
 */
var propNameCamelify = function (name) {
    var parts = name.split('-');
    var re = parts[0] || '';
    for (var i = 1, len = parts.length; i < len; i++) {
        // Use the current segment; reading parts[1] here repeated the second
        // segment for names with three or more segments.
        var p = parts[i];
        re += p.substr(0, 1).toUpperCase() + p.substr(1);
    }
    return re;
};
340 |
/**
 * Builds the style map of an element (or of one of its pseudo elements),
 * restricted to the properties listed in PropertyTable.
 *
 * A property with a non-empty computed value is kept when
 *   - it is listed in preventDefaultProps for this tag, or
 *   - PropertyTable marks it as `inherit`, or
 *   - its value differs from the reference style and its `ignore` predicate
 *     (if any) does not reject the value.
 * The reference style is getNodeDefaultCS() of the tag ('span' for pseudo
 * elements whose table entry has element 'inline'), or the element's own
 * computed style for pseudo entries without an `element`.
 *
 * @param dom - element to inspect
 * @param {?string} pseudo - pseudo element name without colon, or a falsy value
 * @param inSvg - forwarded to getNodeDefaultCS
 * @returns {Object} map of CSS property name to computed value
 */
var getFullStyle = function (dom, pseudo, inSvg) {
    var tag = dom.nodeName.toLowerCase();
    var cs = pseudo ? getComputedStyle(dom, ':' + pseudo) : getComputedStyle(dom);

    var ncs;
    if (pseudo && !pseudoClassTable[pseudo].element) {
        ncs = getComputedStyle(dom);
    } else {
        var referenceTag = (pseudo && pseudoClassTable[pseudo].element === 'inline') ? 'span' : tag;
        ncs = getNodeDefaultCS(referenceTag, inSvg);
    }

    var re = {};
    for (var prop in PropertyTable) {
        var cprop = propNameCamelify(prop);
        var value = cs[cprop];
        if (!value) continue;

        var rule = PropertyTable[prop];
        var alwaysKept = preventDefaultProps[tag + ' ' + prop] || rule.inherit;
        if (!alwaysKept) {
            if (value === ncs[cprop]) continue;
            if (rule.ignore && rule.ignore(value)) continue;
        }
        re[prop] = value;
    }

    cleanComputedStyle(re);
    return re;
};
366 |
// Pseudo elements that are captured, and the kind of element they behave like.
const pseudoClassTable = {
    'before': { element: 'inline' },
    'after': { element: 'inline' }
};

/**
 * Collects the styles of the element's pseudo elements listed in pseudoClassTable.
 * A pseudo element is included only when its computed `content` is non-empty.
 * Properties marked `inherit` in PropertyTable are dropped from a pseudo style
 * when the element itself computes to the same value.
 *
 * @param dom - element to inspect
 * @param domStyle - accepted but not used
 * @param inSvg - forwarded to getFullStyle
 * @returns {?Object} map of pseudo name to style map, or null when there is none
 */
var getPseudoElements = function (dom, domStyle, inSvg) {
    var found = {};
    for (var name in pseudoClassTable) {
        // Entries without an `element` kind are not handled (none exist so far).
        if (!pseudoClassTable[name].element) continue;
        if (!getComputedStyle(dom, ':' + name).content) continue;

        var style = getFullStyle(dom, name, inSvg);
        found[name] = style;

        var ownStyle = getComputedStyle(dom);
        for (var prop in style) {
            var sameAsOwner = ownStyle[propNameCamelify(prop)] === style[prop];
            if (PropertyTable[prop].inherit && sameAsOwner) {
                delete style[prop];
            }
        }
    }
    return Object.keys(found).length === 0 ? null : found;
};
394 |
395 | const preventDefaultProps = {
396 | 'a color': true,
397 | 'a text-decoration': true,
398 | 'em font': true,
399 | 'input outline': true,
400 | 'input border': true,
401 | 'input border-top': true,
402 | 'input border-right': true,
403 | 'input border-bottom': true,
404 | 'input border-left': true,
405 | 'input box-sizing': true,
406 | 'fieldset border': true,
407 | 'fieldset border-top': true,
408 | 'fieldset border-right': true,
409 | 'fieldset border-bottom': true,
410 | 'fieldset border-left': true,
411 | 'textarea outline': true,
412 | 'textarea border': true,
413 | 'textarea border-top': true,
414 | 'textarea border-right': true,
415 | 'textarea border-bottom': true,
416 | 'textarea border-left': true,
417 | 'button border': true,
418 | 'button border-top': true,
419 | 'button border-right': true,
420 | 'button border-bottom': true,
421 | 'button border-left': true,
422 | 'button color': true,
423 | 'ul margin': true,
424 | 'h1 font': true,
425 | 'h2 font': true,
426 | 'figure margin': true
427 | };
428 |
/**
 * Captures the metadata tree of an element twice — once as displayed and once
 * with `display: none` forced — and patches the displayed tree with values
 * from the hidden one (see `patch`). The element's inline `display` value is
 * put back exactly as it was, so the call leaves the element's inline style
 * unchanged.
 *
 * @param dom - root element to capture
 * @returns {Object} the patched metadata tree of the displayed element
 */
var getMetaData = function (dom) {
    var metaShow = getFullMetaData(dom);
    // Remember the INLINE value (usually ''), not the computed one: writing the
    // computed value back would pin it on the element as an inline override.
    var inlineDisplay = dom.style.display;
    dom.style.display = 'none';
    var metaHide;
    try {
        metaHide = getFullMetaData(dom);
    } finally {
        dom.style.display = inlineDisplay;
    }

    // Properties whose pixel values are always taken from the displayed tree.
    var propsKeptInNode1 = ['transform', 'transform-origin', 'transition'];

    // Recursively merges node2 (hidden capture) into node1 (displayed capture).
    var patch = function (node1, node2) {
        var nodeName = node1.nodeName;
        if (node1.style) {
            for (var p in node1.style) {
                if (node1.style[p] === undefined) {
                    delete node1.style[p];
                    continue;
                }
                // Values containing 'px' are replaced by the hidden capture's value.
                if (/px/.test(node1.style[p]) && propsKeptInNode1.indexOf(p) < 0) {
                    if (node2.style[p] === undefined) {
                        delete node1.style[p];
                        continue;
                    }
                    node1.style[p] = node2.style[p];
                    // Drop the property again when the replacement is a plain
                    // 'auto' or equals the table's default for this node name.
                    if ((node1.style[p] === 'auto' && !(PropertyTable[p].default && node1.style[p] !== PropertyTable[p].default(nodeName)))
                        || (!PropertyTable[p].inherit && (PropertyTable[p].default && PropertyTable[p].default(nodeName) === node1.style[p]))) {
                        delete node1.style[p];
                    }
                }
            }
            for (var p in node2.style) {
                if (node1.style[p] == null && node2.style[p].indexOf('auto') >= 0 && (!PropertyTable[p].default || node2.style[p] !== PropertyTable[p].default(nodeName))) {
                    node1.style[p] = node2.style[p]; // this could fix the problem of margin auto 0
                }
            }
        }
        if (node1.childNodes) {
            for (var i = 0, len = node1.childNodes.length; i < len; i++) {
                patch(node1.childNodes[i], node2.childNodes[i]);
            }
        }
        if (node1.pseudo) {
            for (var i in node1.pseudo) {
                // Pseudo styles come from the hidden capture, except the kept properties.
                var keptProps = {};
                for (let keptProp of propsKeptInNode1) {
                    if (node1.pseudo[i][keptProp]) keptProps[keptProp] = node1.pseudo[i][keptProp];
                }
                node1.pseudo[i] = extendObj(node2.pseudo[i], keptProps);
            }
        }
    };
    patch(metaShow, metaHide);
    return metaShow;
};
/**
 * Captures the metadata tree of an element while it is forced to
 * `display: none`, then records the element's previously computed display
 * value on the root of the captured tree.
 * Note: the element's inline display is left at 'none'.
 */
var getMetaData_test = function (dom) {
    var shownAs = getComputedStyle(dom).display;
    dom.style.display = 'none';
    var meta = getFullMetaData(dom);
    meta.style.display = shownAs;
    return meta;
};
488 |
489 | const reservedAttrs = {
490 | 'a': ['href', 'target'],
491 | 'img': ['src'],
492 | 'input': ['placeholder', 'value', 'type'],
493 | 'textarea': ['placeholder', 'value']
494 | };
495 |
496 | // notice: some attributes would be ignored by default, see variable 'ignoreTable' of function 'getAttributes'
497 | const ignoredAttrs = {
498 | 'svg': [],
499 | 'svg/*': [],
500 | 'table': [],
501 | 'table/*': []
502 | };
503 |
/**
 * Reads attributes of an element into a plain object.
 *
 * With `allowAttrNames`, exactly those attributes are read (when present,
 * including empty ones) and nothing is ignored. Otherwise every attribute is
 * read except 'id', 'class', 'style' and the names in `ignoreAttrNames`.
 * When `filter` is given, each value is replaced by filter(name, value).
 *
 * @param dom - element to read from
 * @param {?Iterable<string>} ignoreAttrNames - extra names to skip
 * @param {?Iterable<string>} allowAttrNames - whitelist; takes precedence
 * @param {?function(string, string): *} filter - optional value transform
 * @returns {Object} map of attribute name to (filtered) value
 */
var getAttributes = function (dom, ignoreAttrNames, allowAttrNames, filter) {
    var collected = {};
    var store = function (name, value) {
        collected[name] = filter ? filter(name, value) : value;
    };

    if (allowAttrNames) {
        for (let name of allowAttrNames) {
            var allowed = dom.getAttribute(name);
            if (allowed || allowed === '') store(name, allowed);
        }
        return collected;
    }

    var skipped = { 'id': true, 'class': true, 'style': true };
    if (ignoreAttrNames) {
        for (let name of ignoreAttrNames) skipped[name] = true;
    }

    var attrs = dom.attributes;
    for (var idx = 0, count = attrs.length; idx < count; idx++) {
        var attr = attrs[idx];
        if (skipped[attr.name]) continue;
        store(attr.name, attr.value);
    }
    return collected;
};
532 |
/**
 * Removes every attribute from an element, always dropping the first
 * remaining one until none is left.
 * @param dom - element to strip
 * @returns the same element
 */
var cleanAttributes = function (dom) {
    var hasAttributes = function () {
        return dom.attributes.length > 0;
    };
    while (hasAttributes()) {
        var firstName = dom.attributes[0].name;
        dom.removeAttribute(firstName);
    }
    return dom;
};
538 |
/**
 * Recursively converts a DOM node into a plain metadata tree.
 *
 * Text nodes become { nodeName: '#text', value }. Node names listed in
 * ignoreNodeName yield null and are left out of their parent's childNodes.
 * Elements yield { nodeName, style, [attrs], [pseudo], [childNodes] }:
 *   - attrs: all attributes when `keepAttrs` is set; otherwise all but the
 *     ignoredAttrs of the tag, or only the reservedAttrs of the tag (href/src
 *     resolved against the page URL when conf.recoverUrlInAttr is on);
 *     omitted when empty.
 *   - children whose `inherit` properties equal the parent's value lose those
 *     properties, unless preventDefaultProps lists them for the child tag.
 *
 * @param dom - node to convert
 * @param keepAttrs - keep all attributes of this node and its descendants
 * @param inSvg - true once inside an <svg>; forwarded to the style helpers
 * @returns {?Object} metadata tree, or null for ignored nodes
 */
var getFullMetaData = function (dom, keepAttrs, inSvg) {
    var type = dom.nodeName.toLowerCase();
    if (type === '#text') {
        return { nodeName: '#text', value: dom.nodeValue };
    }
    if (ignoreNodeName[type]) return null;

    inSvg = inSvg || (type === 'svg');

    var meta = {
        nodeName: type,
        style: getFullStyle(dom, null, inSvg)
    };

    if (keepAttrs) {
        meta.attrs = getAttributes(dom);
    } else if (ignoredAttrs[type]) {
        meta.attrs = getAttributes(dom, ignoredAttrs[type]);
    } else if (reservedAttrs[type]) {
        var resolveUrlAttr = function (attrName, attrValue) {
            var isUrlAttr = attrName === 'href' || attrName === 'src';
            return (isUrlAttr && conf.recoverUrlInAttr) ? recoverUrl(window.location.href, attrValue) : attrValue;
        };
        meta.attrs = getAttributes(dom, null, reservedAttrs[type], resolveUrlAttr);
    }
    if (meta.attrs && Object.keys(meta.attrs).length === 0) {
        delete meta.attrs;
    }

    // A '<tag>/*' entry in ignoredAttrs makes all descendants keep their attributes.
    var keepForChildren = ignoredAttrs[type + '/*'] ? true : keepAttrs;

    var pseudo = getPseudoElements(dom, meta.style, inSvg);
    if (pseudo) meta.pseudo = pseudo;

    // Removes inherited properties the child shares with this node.
    var dropInheritedDuplicates = function (child) {
        var duplicated = [];
        for (var prop in child.style) {
            if (preventDefaultProps[child.nodeName + ' ' + prop]) continue;
            if (!PropertyTable[prop].inherit) continue;
            if (meta.style[prop] === child.style[prop]) duplicated.push(prop);
        }
        for (var d = 0; d < duplicated.length; d++) {
            delete child.style[duplicated[d]];
        }
    };

    if (dom.childNodes.length) {
        meta.childNodes = [];
        dom.childNodes.forEach(function (el) {
            var childData = getFullMetaData(el, keepForChildren, inSvg);
            if (!childData) return;
            if (childData.nodeName !== '#text') dropInheritedDuplicates(childData);
            meta.childNodes.push(childData);
        });
    }

    return meta;
};
601 |
// Selector -> style-object map of every CSS rule generated so far.
var styleSheetData = {};

/**
 * Serialize a style object into CSS declarations.
 *
 * @param {Object} obj - `{ property: value }` map
 * @param {boolean} [indent] - truthy: one declaration per line as `prop: value;`;
 *                             falsy: compact `prop:value;` on a single line
 * @returns {string} the concatenated declarations ('' for an empty object)
 */
var stringOfStyleObj = function (obj, indent) {
    var lead = indent ? '\n ' : '';
    var separator = (lead === '') ? ':' : ': ';
    var css = '';
    for (var name in obj) {
        css += lead + name + separator + obj[name] + ';';
    }
    return css;
};
611 |
// Bookkeeping shared by `addCssRule` (reset by `initData`):
var nodeTypeCount = {};          // node name -> number of class names generated for it
var cssRuleValueHash2Name = {};  // hash of a style object -> list of class names carrying that style
var cssRuleName2ValueHash = {};  // class name (or 'class:pseudo') -> hash of its style

/**
 * Register a style (plus optional pseudo-element styles) and return the class name for it.
 *
 * An existing class name is reused when its own style hash matches and, for every key
 * of `pseudoClassTable`, its recorded pseudo style hash equals the one requested here.
 * Otherwise a new name `[<module>-]<nodeName><counter>` is generated and its rules are
 * stored in `styleSheetData`.
 *
 * @param {string} nodeName - node name used to build the class name
 * @param {Object} obj - style object of the node itself
 * @param {Object} [pseudo] - map of pseudo name -> style object (falsy entries are ignored)
 * @returns {string} class name to put on the node
 */
var addCssRule = function (nodeName, obj, pseudo) {
    var digest = function (styleObj) {
        return stringOfStyleObj(styleObj).hashCode();
    };

    var ownHash = digest(obj);

    var pseudoStyles = {};
    var pseudoDigests = {};
    if (pseudo) {
        for (var key in pseudo) {
            var pseudoStyle = pseudo[key] || undefined;
            pseudoStyles[key] = pseudoStyle;
            pseudoDigests[key] = pseudoStyle ? digest(pseudoStyle) : undefined;
        }
    }

    // Try to reuse a class whose own style and all pseudo styles are identical.
    var candidates = cssRuleValueHash2Name[ownHash];
    if (candidates) {
        for (let candidate of candidates) {
            var matches = true;
            for (var pc in pseudoClassTable) {
                if (cssRuleName2ValueHash[candidate + ':' + pc] !== pseudoDigests[pc]) {
                    matches = false;
                    break;
                }
            }
            if (matches) return candidate;
        }
    }

    // No match: mint a new class name.
    nodeTypeCount[nodeName] = (nodeTypeCount[nodeName] || 0) + 1;
    var prefix = conf.classNameModulePrefix ? (conf.moduleName + '-') : '';
    var casedName = conf.classNameUpperCase ? nodeName.toUpperCase() : nodeName.toLowerCase();
    var className = prefix + casedName + nodeTypeCount[nodeName];

    if (!cssRuleValueHash2Name[ownHash]) cssRuleValueHash2Name[ownHash] = [];
    cssRuleValueHash2Name[ownHash].push(className);
    for (var dk in pseudoDigests) {
        if (pseudoDigests[dk]) cssRuleName2ValueHash[className + ':' + dk] = pseudoDigests[dk];
    }
    cssRuleName2ValueHash[className] = ownHash;

    styleSheetData['.' + className] = obj;
    for (var sk in pseudoStyles) {
        if (pseudoStyles[sk]) styleSheetData['.' + className + ':' + sk] = pseudoStyles[sk];
    }

    return className;
};
664 |
/**
 * Return the helper iframe, creating and attaching it to `document.body` on first use.
 *
 * @param {string} [iframeSrc] - when truthy, assigned to the iframe's `src`
 * @returns {*} whatever `window.frames['qwe123']` yields, or the freshly created iframe element
 */
var getHelperIframe = function (iframeSrc) {
    var HELPER_ID = 'qwe123';

    var frame = window.frames[HELPER_ID];
    if (!frame) {
        frame = document.createElement('iframe');
        frame.id = HELPER_ID;
        document.body.appendChild(frame);
    }

    if (iframeSrc) frame.src = iframeSrc;
    return frame;
};
678 |
/**
 * Get the computed style of a pristine element of the given type, measured inside the helper iframe.
 *
 * The first element of that tag name found in the iframe document is reused; if there is
 * none, one is created (in the SVG namespace when `inSvg` or the name is 'svg') and appended
 * to the iframe body. Properties listed below are then overridden with
 * `PropertyTable[prop].default(nodeName)` unless that returns `false`.
 *
 * @param {string} nodeName - tag name
 * @param {boolean} [inSvg] - whether the element belongs to an SVG subtree
 * @returns {Object} plain copy of the computed style
 */
var getNodeDefaultCS = function (nodeName, inSvg) {
    var SVG_NS = 'http://www.w3.org/2000/svg';
    var svgContext = inSvg || (nodeName === 'svg');

    var helperDoc = getHelperIframe().contentDocument;
    var existing = helperDoc.getElementsByTagName(nodeName);

    var sample;
    if (existing.length) {
        sample = existing[0];
    } else {
        sample = svgContext
            ? helperDoc.createElementNS(SVG_NS, nodeName)
            : helperDoc.createElement(nodeName);
        helperDoc.body.appendChild(sample);
    }

    var defaults = extendObj({}, getComputedStyle(sample));

    // Properties whose default is taken from PropertyTable instead of the measured value.
    var overridden = ['transform-origin'];
    for (let prop of overridden) {
        var entry = PropertyTable[prop];
        if (!entry || !entry.default) continue;
        var tableDefault = entry.default(nodeName);
        if (tableDefault === false) continue;
        defaults[propNameCamelify(prop)] = tableDefault;
    }

    return defaults;
};
706 |
/**
 * Plugin: hoist declarations shared by all children of a class into a `.parent>*` rule.
 *
 * For every class in the metadata tree, the children of all nodes carrying that class are
 * pooled. If the pooled children use at least two distinct classes, at least two declarations
 * are common to all of those classes, and every node of each child class is in the pool,
 * then the common declarations are stored as `styleData['.<class>>*']`, removed from the
 * child class rules, and recorded on the parent nodes as `followers`.
 *
 * @param {*} dom - unused
 * @param {Object} styleData - selector -> style object map; mutated in place
 * @param {Object} metaData - root of the metadata tree (nodes carry `className`); mutated in place
 */
var pl_extractCommonCssFromChildren = function (dom, styleData, metaData) {
    var MIN_CHILD_CLASSES = 2;
    var MIN_SHARED_DECLARATIONS = 2;
    var SEPARATOR = ': ';

    var isText = function (node) {
        return node.nodeName === '#text';
    };

    /* declarations present in the rule of every distinct child class, or null */
    var sharedDeclarations = function (children) {
        if (!children) return null;

        var seenClasses = {};
        var tally = {};
        var distinctClasses = 0;
        children.forEach(function (child) {
            if (isText(child) || seenClasses[child.className]) return;
            seenClasses[child.className] = true;
            distinctClasses++;
            var rule = styleData['.' + child.className];
            for (var prop in rule) {
                var declaration = prop + SEPARATOR + rule[prop];
                tally[declaration] = (tally[declaration] || 0) + 1;
            }
        });

        var shared = {};
        if (distinctClasses >= MIN_CHILD_CLASSES) {
            for (var decl in tally) {
                if (tally[decl] < distinctClasses) continue;
                var cut = decl.indexOf(SEPARATOR);
                shared[decl.slice(0, cut)] = decl.slice(cut + SEPARATOR.length);
            }
        }
        return Object.keys(shared).length >= MIN_SHARED_DECLARATIONS ? shared : null;
    };

    /* index: class name -> all nodes using it */
    var nodesByClass = {};
    var indexTree = function (node) {
        if (node.className) {
            if (!nodesByClass[node.className]) nodesByClass[node.className] = [];
            nodesByClass[node.className].push(node);
        }
        if (node.childNodes) {
            node.childNodes.forEach(function (child) { indexTree(child); });
        }
    };
    indexTree(metaData);

    /* try to hoist for the class of `node`; no-op when conditions are not met */
    var hoistFor = function (className) {
        var peers = nodesByClass[className];
        var pooledChildren = [];
        peers.forEach(function (peer) {
            if (peer.childNodes) pooledChildren = pooledChildren.concat(peer.childNodes);
        });

        var shared = sharedDeclarations(pooledChildren);
        if (!shared) return;

        // every node of each child class must itself be one of the pooled children
        var checkedClasses = {};
        var selfContained = pooledChildren.every(function (child) {
            if (isText(child) || checkedClasses[child.className]) return true;
            checkedClasses[child.className] = true;
            return nodesByClass[child.className].every(function (sibling) {
                return pooledChildren.indexOf(sibling) >= 0;
            });
        });
        if (!selfContained) return;

        styleData['.' + className + '>*'] = shared;
        peers.forEach(function (peer) { peer.followers = shared; });

        for (var childClass in checkedClasses) {
            for (var prop in shared) {
                delete styleData['.' + childClass][prop];
            }
        }
    };

    var visit = function (node) {
        var className = node.className;
        if (!className) return;

        if (!node.followers) hoistFor(className);

        if (node.childNodes) {
            node.childNodes.forEach(function (child) {
                if (!isText(child)) visit(child);
            });
        }
    };

    visit(metaData);
};
800 |
/**
 * Plugin: collapse equal `overflow-x` / `overflow-y` into the `overflow` shorthand.
 *
 * @param {*} dom - unused
 * @param {Object} [styles] - selector -> style object map; mutated in place
 */
var pl_overflowCombine = function (dom, styles = {}) {
    for (var selector in styles) {
        var rule = styles[selector];
        var horizontal = rule['overflow-x'];
        if (!horizontal || horizontal !== rule['overflow-y']) continue;

        rule['overflow'] = horizontal;
        delete rule['overflow-x'];
        delete rule['overflow-y'];
    }
};
/**
 * Plugin: collapse four identical `border-*` side declarations into the `border` shorthand.
 * A rule is only touched when all four sides are present (truthy) and strictly equal.
 *
 * @param {*} dom - unused
 * @param {Object} [styles] - selector -> style object map; mutated in place
 */
var pl_borderCombile = function (dom, styles = {}) {
    var SIDES = ['border-top', 'border-right', 'border-bottom', 'border-left'];

    var uniformBorder = function (rule) {
        var first = rule[SIDES[0]];
        var allSame = SIDES.every(function (side) {
            return rule[side] && rule[side] === first;
        });
        return allSame ? first : null;
    };

    for (var selector in styles) {
        var rule = styles[selector];
        var border = uniformBorder(rule);
        if (!border) continue;

        rule['border'] = border;
        SIDES.forEach(function (side) {
            delete rule[side];
        });
    }
};
// Post-processing plugins, run in order by `presentDom` as `pl(ndom, styleSheetData, rootMeta)`.
var plugins = [
    pl_overflowCombine,
    pl_borderCombile,
    pl_extractCommonCssFromChildren
];

/**
 * Register an additional plugin; it runs after the ones already registered.
 *
 * @param {function} handler - plugin function
 */
var plugin = function (handler) {
    plugins.push(handler);
};
834 |
/**
 * Rebuild a DOM subtree from a metadata tree.
 *
 * Each non-text node receives its recorded attributes and then a `class` attribute
 * obtained from `addCssRule`; that class name is also written back to `meta.className`.
 * Elements at or below an `svg` node are created in the SVG namespace.
 *
 * @param {Object} meta - metadata node (`nodeName`, `style`, optional `attrs`, `pseudo`, `childNodes`, or a '#text' node with `value`)
 * @param {boolean} [inSvg] - whether an ancestor is an `svg` element
 * @returns {Node} the created text node or element
 */
var buildDom = function (meta, inSvg) {
    if (meta.nodeName === '#text') {
        return document.createTextNode(meta.value);
    }

    var svgContext = inSvg || (meta.nodeName === 'svg');
    var element = svgContext
        ? document.createElementNS('http://www.w3.org/2000/svg', meta.nodeName)
        : document.createElement(meta.nodeName);

    if (meta.attrs) {
        for (var attrName in meta.attrs) {
            element.setAttribute(attrName, meta.attrs[attrName]);
        }
    }

    // The generated class is applied after the attributes, and remembered on the metadata.
    var generatedClass = addCssRule(meta.nodeName, meta.style, meta.pseudo);
    element.setAttribute('class', generatedClass);
    meta.className = generatedClass;

    if (meta.childNodes) {
        meta.childNodes.forEach(function (childMeta) {
            element.appendChild(buildDom(childMeta, svgContext));
        });
    }

    return element;
};
865 |
/**
 * Shallow-copy every enumerable property of `src` (inherited ones included) onto `dest`.
 *
 * @param {Object} dest - object to extend; mutated in place
 * @param {Object} [src] - source of properties; nothing is copied when omitted
 * @returns {Object} `dest`
 */
var extendObj = function (dest, src = {}) {
    var key;
    for (key in src) {
        dest[key] = src[key];
    }
    return dest;
};
/**
 * Replace the current page with a re-generated, self-contained copy of `dom`,
 * then report the generated markup and stylesheet.
 *
 * Steps: reset internal state, merge `options` into `conf`, collect metadata for `dom`,
 * wipe `<head>` and `<body>`, rebuild the DOM with generated class names, run the plugins,
 * scope all selectors under the module name, write the stylesheet, and finally (after the
 * optional font-face fetch settles) log the result, send it through
 * `chrome.runtime.sendMessage` when that API is available, and POST it to
 * `conf.serverHost + '/post'` when `conf.pullContent` is set.
 *
 * @param {Node} dom - root node to present
 * @param {string} [moduleName] - module name; falls back to `conf.moduleName`
 * @param {Object} [options] - overrides merged into `conf`
 */
var presentDom = function (dom, moduleName, options) {

    initData();

    extendObj(conf, options);
    if (moduleName) conf.moduleName = moduleName;
    moduleName = conf.moduleName;

    var styleSheet = document.createElement('style');
    var ndom;
    // Decided from the metadata (set below), not from `ndom.nodeName`: in HTML documents
    // `nodeName` of an element is upper-case ('BODY'), so comparing it to 'body' never matches.
    var isBodyRoot = false;

    var output = () => {
        var outputData = {
            name: moduleName,
            type: conf.generateType,
            style: styleSheet.innerHTML,
            html: isBodyRoot ? ndom.innerHTML : ndom.outerHTML
        };
        console.log(outputData);

        // `chrome` can exist without `chrome.runtime` on ordinary pages; a failure here
        // must not prevent the upload below.
        if (typeof chrome !== 'undefined' && chrome.runtime && typeof chrome.runtime.sendMessage === 'function') {
            try {
                chrome.runtime.sendMessage(
                    JSON.parse(JSON.stringify(outputData)),
                    function (response) {
                        console.log(response);
                    });
            } catch (err) {
                console.error(err);
            }
        }

        if (conf.pullContent) {
            var postData = new FormData();
            postData.append('json', JSON.stringify(outputData));
            fetch(conf.serverHost + '/post', {
                method: 'post',
                mode: 'cors',
                headers: {
                    'Accept': '*'
                },
                body: postData
            }).then(function (res) { return res.json(); })
                .then(function (res) {
                    if (res.code === 200) {
                        console.log('[SUCCESS] to save the content.');
                    } else {
                        console.error('[ERROR] to save the content.');
                    }
                })
                .catch(function (err) {
                    // network failure or a non-JSON response
                    console.error('[ERROR] to save the content.', err);
                });
        }
    };

    var promises = [];

    if (conf.fetchFont) {
        promises.push(getFontFaces().then(results => {
            styleSheet.innerHTML = results.map(result => result.join('\n')).join('\n') + '\n' + styleSheet.innerHTML;
            console.log('[SUCCESS] to get all font-face rules.');
        }).catch(() => {
            console.error('[ERROR] to get all font-face rules.');
        }));
    }

    var rootMeta = getMetaData(dom);
    isBodyRoot = (rootMeta.nodeName === 'body');
    document.head.innerHTML = '';
    cleanAttributes(document.body).innerHTML = '';
    if (!isBodyRoot) document.body.style.margin = '0';
    document.head.appendChild(styleSheet);

    ndom = buildDom(rootMeta); // will add a `className` to each valid node in `rootMeta`

    PLUGINS: plugins.forEach(pl => pl.call(null, ndom, styleSheetData, rootMeta));

    SET_MODULE_NAME: {
        var moduleClassNameAlready = ndom.getAttribute('class');
        // "alone" = no descendant shares the root's generated class, so it can be replaced outright
        var moduleClassAlone = !ndom.getElementsByClassName(moduleClassNameAlready).length;
        rootMeta.className = moduleClassAlone ? moduleName : (moduleName + ' ' + moduleClassNameAlready);
        ndom.setAttribute('class', rootMeta.className);
        for (var sel in styleSheetData) {
            if (!styleSheetData[sel]) {
                delete styleSheetData[sel];
                continue;
            }
            if (sel === '.' + moduleClassNameAlready || sel.startsWith('.' + moduleClassNameAlready + ':')
                || sel.startsWith('.' + moduleClassNameAlready + '>')) {
                if (moduleClassAlone) {
                    // rename '.rootClass…' to '.moduleName…'
                    var selector = '.' + moduleName + sel.substr(1 + moduleClassNameAlready.length);
                    styleSheetData[selector] = styleSheetData[sel];
                    delete styleSheetData[sel];
                    continue;
                } else {
                    // root itself: '.moduleName.rootClass…'; descendants are handled just below
                    styleSheetData['.' + moduleName + sel] = styleSheetData[sel];
                }
            }
            styleSheetData['.' + moduleName + ' ' + sel] = styleSheetData[sel];
            delete styleSheetData[sel];
        }
    }

    var styles = [];
    for (var sel in styleSheetData) {
        styles.push([sel, styleSheetData[sel]]);
    }
    styleSheet.innerHTML += styles
        .filter(rule => (Object.keys(rule[1]).length > 0))
        .map(rule => rule[0] + ' {' + stringOfStyleObj(rule[1], true) + '\n}').join('\n');

    if (!isBodyRoot) document.body.appendChild(ndom);
    else {
        document.body.setAttribute('class', ndom.getAttribute('class'));
        document.body.innerHTML = ndom.innerHTML;
    }

    Promise.all(promises)
        .then(() => output())
        .catch(err => {
            console.error(err);
        });
};
984 |
/**
 * Reset all module-level state accumulated by a previous run:
 * the class-name bookkeeping of `addCssRule` and the generated stylesheet data.
 */
var initData = function () {
    cssRuleName2ValueHash = {};
    cssRuleValueHash2Name = {};
    nodeTypeCount = {};
    styleSheetData = {};
};
991 |
// Public API, exposed on the page as `window.fespider`.
window.fespider = {
    getMetaData,
    present: presentDom,
    plugin
};
997 |
998 | })();
999 |
--------------------------------------------------------------------------------
/test/zhihu/outputs/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenfe/puppeteer-service/7b1731d1725d5376e23109f6054ac963278eec75/test/zhihu/outputs/.gitkeep
--------------------------------------------------------------------------------
/test/zhihu/test-elem-extract.js:
--------------------------------------------------------------------------------
// Manual test: log in, extract an element from the page with `./elem-extract`,
// write the resulting HTML under `outputs/` and open it in the default application.
const url = 'http://www.zhihu.com/';

const puppeteer = require('puppeteer');
const pupConf = require('../../src/config').launch;

// Run with a visible browser window.
pupConf.headless = false;

const readlineSync = require('readline-sync');
const username = readlineSync.question('weibo username: ');
const password = readlineSync.question('weibo password: ', {
    hideEchoBack: true
});
if (readlineSync.question('set the executable path of chromium? [yn]: ') === 'y') {
    pupConf.executablePath = '..\\spiderman\\node_modules\\puppeteer\\.local-chromium\\win64-526987\\chrome-win32\\chrome.exe';
}

const runner = require('./elem-extract');

const fs = require('fs');
const path = require('path');
const open = require('open');

puppeteer.launch(pupConf).then(async browser => {
    let resultHtml;
    try {
        resultHtml = await runner({
            username,
            password,
            url
        }, browser);
    } finally {
        // Close the browser even when the extraction fails, so no Chromium process is left behind.
        await browser.close();
    }

    const filePath = path.resolve(__dirname, `outputs/${Date.now()}.html`);
    fs.writeFileSync(filePath, resultHtml, 'utf8');
    open(filePath);
}).catch(err => {
    // Report launch/extraction/write failures instead of leaving an unhandled rejection.
    console.error(err);
    process.exitCode = 1;
});
35 |
--------------------------------------------------------------------------------