├── .gitignore
├── README.md
├── package-lock.json
├── package.json
└── src
├── config.js
├── data-source
├── baidu.js
├── bing.js
├── so.js
├── sogou.js
├── template
│ └── search-engine.js
└── tianyancha.js
├── duanzi
├── budejie-detail.js
├── common.js
├── gaoxiaogif-detail.js
└── qiubai-detail.js
├── img
└── 5b3f553ed9fb7.gif
├── index.js
├── req.js
├── tmp
└── pengfu.js
└── tools
├── file-upload.js
├── img-sina.js
└── tools.js
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | src/data
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | ### 为什么要做信息聚合工具?意义是什么?
3 | - 在日常工作生活中,多数人遇到新知识、新事物,第一反应便是去百度、知乎、bing、各个社区论坛,搜集信息,然后消化学习~~~
4 | - 那么作为Coder的我们,为什么要手动去多个平台搜集信息,能不能有什么工具帮我们去各个平台搜集信息,最好还是消化过的,然后把有用的信息给我们呢?
5 | - OK,今天先讨论如何从各个平台搜集信息
6 | - 嗯,如标题所示,这里采用的技术方案是Puppeteer。至于为什么要用Puppeteer?原因有二:1. 使用Puppeteer模拟用户操作,而不是简单的接口调用去获取数据,极大降低被判定为爬虫的风险。2. Puppeteer可以抓取客户端渲染的站点内容,这是传统爬虫难以做到的!
7 |
8 | ### 其次对Puppeteer进行简介
9 | - Puppeteer本质是一个node库,它提供了一组通过DevTools Protocol来操纵Headless Chrome的高级API
10 | - 其他详细介绍:[请猛戳Puppeteer官方文档](https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md#puppeteerlaunchoptions)
11 |
12 | #### Puppeteer 核心功能
13 | - 生成网页PDF、截图
14 | - 从网站抓取内容
15 | - 爬取SPA应用,并生成预渲染内容
16 | - UI自动化操作:表单提交、UI测试、键盘输入等
17 | - 创建一个最新的自动化测试环境(chrome),可以直接在此运行测试用例
18 | - 捕获站点时间线,帮助分析网站性能问题
19 |
20 | ### 接下来开始实战,Repo在 [Github Eva](https://github.com/zhentaoo/eva)
21 | 1. 运行Puppeteer
22 | ```js
23 | puppeteer.launch().then(async browser => {
24 | ......
25 | // do what you want
26 | ......
27 | })
28 | ```
29 |
30 | 2. 开一个新的tab页,跳转至事先定义好的站点
31 | ```js
32 | let page = await browser.newPage();
33 | await page.goto(`https://www.tianyancha.com/search?key=${key}`);
34 | ```
35 |
36 | 3. 分析站点dom结构,抓取你想要的内容
37 | ```js
38 | let data = await page.evaluate(() => {
39 | let list = [...document.querySelectorAll('.search_result_single')]
40 |
41 | return list.map(el => {
42 | return { html: el.innerHTML, content: el.innerText }
43 | })
44 | })
45 | ```
46 |
47 | 4. 将内容写入文件(如果大家感兴趣可以做成数据库存取,这不是重点)
48 | ```js
49 | fs.appendFileSync(`./src/data/tianyancha-${key}.txt`, `startTime: ${new Date().toUTCString()}`+'\r');
50 |
51 | fs.appendFileSync(`./src/data/tianyancha-${key}.txt`, JSON.stringify(content, null , ' ')+'\r');
52 | ```
53 |
54 | 5. 翻页操作,获取下一页的内容
55 | ```js
56 | for (let index = 0; index < needPageMaxNum; index++) {
57 | var nextPage = await page.$('#web-content > div > div > div > div.col-9.search-2017-2.pr15.pl0 > div.b-c-white.clearfix.position-rel.mb30 > div > div.search_pager.human_pager.in-block > ul > li.pagination-next.ng-scope > a')
58 |
59 | await nextPage.click()
60 | await timeout(6 * 1000 * Math.random());
61 | await getDataFromDom()
62 | }
63 | ```
64 |
65 | 6. 数据获取完毕,关闭页面
66 | ```js
67 | await page.close()
68 | ```
69 |
70 | 7. 小细节,为了更加真实,在模拟用户操作的过程中,使用随机时间作为间隔
71 | ```js
72 | await timeout(6 * 1000 * Math.random());
73 | ```
74 |
75 | 8. 最后说明,这里只做了最基础的数据爬取,后续可能会做出duckduckgo么?敬请期待
76 |
77 | 9. 完整代码在: https://github.com/zhentaoo/eva (Demo型项目,无敏感信息)
78 |
79 | 10. 项目运行
80 | - git clone https://github.com/zhentaoo/eva
81 | - npm install (puppeteer在win下100+M、mac下70+M,请耐心等候,如果安装不了,请使用cnpm)
82 | - npm start
83 |
--------------------------------------------------------------------------------
/package-lock.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "eva",
3 | "version": "1.0.0",
4 | "lockfileVersion": 1,
5 | "requires": true,
6 | "dependencies": {
7 | "ajv": {
8 | "version": "5.5.2",
9 | "resolved": "http://registry.npm.taobao.org/ajv/download/ajv-5.5.2.tgz",
10 | "integrity": "sha1-c7Xuyj+rZT49P5Qis0GtQiBdyWU=",
11 | "requires": {
12 | "co": "4.6.0",
13 | "fast-deep-equal": "1.1.0",
14 | "fast-json-stable-stringify": "2.0.0",
15 | "json-schema-traverse": "0.3.1"
16 | }
17 | },
18 | "asn1": {
19 | "version": "0.2.3",
20 | "resolved": "http://registry.npm.taobao.org/asn1/download/asn1-0.2.3.tgz",
21 | "integrity": "sha1-2sh4dxPJlmhJ/IGAd36+nB3fO4Y="
22 | },
23 | "assert-plus": {
24 | "version": "1.0.0",
25 | "resolved": "http://registry.npm.taobao.org/assert-plus/download/assert-plus-1.0.0.tgz",
26 | "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU="
27 | },
28 | "async-limiter": {
29 | "version": "1.0.0",
30 | "resolved": "http://registry.npm.taobao.org/async-limiter/download/async-limiter-1.0.0.tgz",
31 | "integrity": "sha1-ePrtjD0HSrgfIrTphdeehzj3IPg="
32 | },
33 | "asynckit": {
34 | "version": "0.4.0",
35 | "resolved": "http://registry.npm.taobao.org/asynckit/download/asynckit-0.4.0.tgz",
36 | "integrity": "sha1-x57Zf380y48robyXkLzDZkdLS3k="
37 | },
38 | "aws-sign2": {
39 | "version": "0.7.0",
40 | "resolved": "http://registry.npm.taobao.org/aws-sign2/download/aws-sign2-0.7.0.tgz",
41 | "integrity": "sha1-tG6JCTSpWR8tL2+G1+ap8bP+dqg="
42 | },
43 | "aws4": {
44 | "version": "1.7.0",
45 | "resolved": "http://registry.npm.taobao.org/aws4/download/aws4-1.7.0.tgz",
46 | "integrity": "sha1-1NDpudv8p3vwjusKikcVUP454ok="
47 | },
48 | "balanced-match": {
49 | "version": "1.0.0",
50 | "resolved": "http://registry.npm.taobao.org/balanced-match/download/balanced-match-1.0.0.tgz",
51 | "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c="
52 | },
53 | "bcrypt-pbkdf": {
54 | "version": "1.0.1",
55 | "resolved": "http://registry.npm.taobao.org/bcrypt-pbkdf/download/bcrypt-pbkdf-1.0.1.tgz",
56 | "integrity": "sha1-Y7xdy2EzG5K8Bf1SiVPDNGKgb40=",
57 | "optional": true,
58 | "requires": {
59 | "tweetnacl": "0.14.5"
60 | }
61 | },
62 | "bluebird": {
63 | "version": "3.5.1",
64 | "resolved": "http://registry.npm.taobao.org/bluebird/download/bluebird-3.5.1.tgz",
65 | "integrity": "sha1-2VUfnemPH82h5oPRfukaBgLuLrk="
66 | },
67 | "brace-expansion": {
68 | "version": "1.1.11",
69 | "resolved": "http://registry.npm.taobao.org/brace-expansion/download/brace-expansion-1.1.11.tgz",
70 | "integrity": "sha1-PH/L9SnYcibz0vUrlm/1Jx60Qd0=",
71 | "requires": {
72 | "balanced-match": "1.0.0",
73 | "concat-map": "0.0.1"
74 | }
75 | },
76 | "caseless": {
77 | "version": "0.12.0",
78 | "resolved": "http://registry.npm.taobao.org/caseless/download/caseless-0.12.0.tgz",
79 | "integrity": "sha1-G2gcIf+EAzyCZUMJBolCDRhxUdw="
80 | },
81 | "co": {
82 | "version": "4.6.0",
83 | "resolved": "http://registry.npm.taobao.org/co/download/co-4.6.0.tgz",
84 | "integrity": "sha1-bqa989hTrlTMuOR7+gvz+QMfsYQ="
85 | },
86 | "combined-stream": {
87 | "version": "1.0.6",
88 | "resolved": "http://registry.npm.taobao.org/combined-stream/download/combined-stream-1.0.6.tgz",
89 | "integrity": "sha1-cj599ugBrFYTETp+RFqbactjKBg=",
90 | "requires": {
91 | "delayed-stream": "1.0.0"
92 | }
93 | },
94 | "concat-map": {
95 | "version": "0.0.1",
96 | "resolved": "http://registry.npm.taobao.org/concat-map/download/concat-map-0.0.1.tgz",
97 | "integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s="
98 | },
99 | "concat-stream": {
100 | "version": "1.6.0",
101 | "resolved": "http://registry.npm.taobao.org/concat-stream/download/concat-stream-1.6.0.tgz",
102 | "integrity": "sha1-CqxmL9Ur54lk1VMvaUeE5wEQrPc=",
103 | "requires": {
104 | "inherits": "2.0.3",
105 | "readable-stream": "2.3.6",
106 | "typedarray": "0.0.6"
107 | }
108 | },
109 | "core-util-is": {
110 | "version": "1.0.2",
111 | "resolved": "http://registry.npm.taobao.org/core-util-is/download/core-util-is-1.0.2.tgz",
112 | "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac="
113 | },
114 | "dashdash": {
115 | "version": "1.14.1",
116 | "resolved": "http://registry.npm.taobao.org/dashdash/download/dashdash-1.14.1.tgz",
117 | "integrity": "sha1-hTz6D3y+L+1d4gMmuN1YEDX24vA=",
118 | "requires": {
119 | "assert-plus": "1.0.0"
120 | }
121 | },
122 | "debug": {
123 | "version": "2.6.9",
124 | "resolved": "http://registry.npm.taobao.org/debug/download/debug-2.6.9.tgz",
125 | "integrity": "sha1-XRKFFd8TT/Mn6QpMk/Tgd6U2NB8=",
126 | "requires": {
127 | "ms": "2.0.0"
128 | }
129 | },
130 | "delayed-stream": {
131 | "version": "1.0.0",
132 | "resolved": "http://registry.npm.taobao.org/delayed-stream/download/delayed-stream-1.0.0.tgz",
133 | "integrity": "sha1-3zrhmayt+31ECqrgsp4icrJOxhk="
134 | },
135 | "ecc-jsbn": {
136 | "version": "0.1.1",
137 | "resolved": "http://registry.npm.taobao.org/ecc-jsbn/download/ecc-jsbn-0.1.1.tgz",
138 | "integrity": "sha1-D8c6ntXw1Tw4GTOYUj735UN3dQU=",
139 | "optional": true,
140 | "requires": {
141 | "jsbn": "0.1.1"
142 | }
143 | },
144 | "extend": {
145 | "version": "3.0.1",
146 | "resolved": "http://registry.npm.taobao.org/extend/download/extend-3.0.1.tgz",
147 | "integrity": "sha1-p1Xqe8Gt/MWjHOfnYtuq3F5jZEQ="
148 | },
149 | "extract-zip": {
150 | "version": "1.6.6",
151 | "resolved": "http://registry.npm.taobao.org/extract-zip/download/extract-zip-1.6.6.tgz",
152 | "integrity": "sha1-EpDt6NINCHK0Kf0/NRyhKOxe+Fw=",
153 | "requires": {
154 | "concat-stream": "1.6.0",
155 | "debug": "2.6.9",
156 | "mkdirp": "0.5.0",
157 | "yauzl": "2.4.1"
158 | }
159 | },
160 | "extsprintf": {
161 | "version": "1.3.0",
162 | "resolved": "http://registry.npm.taobao.org/extsprintf/download/extsprintf-1.3.0.tgz",
163 | "integrity": "sha1-lpGEQOMEGnpBT4xS48V06zw+HgU="
164 | },
165 | "fast-deep-equal": {
166 | "version": "1.1.0",
167 | "resolved": "http://registry.npm.taobao.org/fast-deep-equal/download/fast-deep-equal-1.1.0.tgz",
168 | "integrity": "sha1-wFNHeBfIa1HaqFPIHgWbcz0CNhQ="
169 | },
170 | "fast-json-stable-stringify": {
171 | "version": "2.0.0",
172 | "resolved": "http://registry.npm.taobao.org/fast-json-stable-stringify/download/fast-json-stable-stringify-2.0.0.tgz",
173 | "integrity": "sha1-1RQsDK7msRifh9OnYREGT4bIu/I="
174 | },
175 | "fd-slicer": {
176 | "version": "1.0.1",
177 | "resolved": "http://registry.npm.taobao.org/fd-slicer/download/fd-slicer-1.0.1.tgz",
178 | "integrity": "sha1-i1vL2ewyfFBBv5qwI/1nUPEXfmU=",
179 | "requires": {
180 | "pend": "1.2.0"
181 | }
182 | },
183 | "forever-agent": {
184 | "version": "0.6.1",
185 | "resolved": "http://registry.npm.taobao.org/forever-agent/download/forever-agent-0.6.1.tgz",
186 | "integrity": "sha1-+8cfDEGt6zf5bFd60e1C2P2sypE="
187 | },
188 | "form-data": {
189 | "version": "2.3.2",
190 | "resolved": "http://registry.npm.taobao.org/form-data/download/form-data-2.3.2.tgz",
191 | "integrity": "sha1-SXBJi+YEwgwAXU9cI67NIda0kJk=",
192 | "requires": {
193 | "asynckit": "0.4.0",
194 | "combined-stream": "1.0.6",
195 | "mime-types": "2.1.18"
196 | }
197 | },
198 | "fs.realpath": {
199 | "version": "1.0.0",
200 | "resolved": "http://registry.npm.taobao.org/fs.realpath/download/fs.realpath-1.0.0.tgz",
201 | "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8="
202 | },
203 | "getpass": {
204 | "version": "0.1.7",
205 | "resolved": "http://registry.npm.taobao.org/getpass/download/getpass-0.1.7.tgz",
206 | "integrity": "sha1-Xv+OPmhNVprkyysSgmBOi6YhSfo=",
207 | "requires": {
208 | "assert-plus": "1.0.0"
209 | }
210 | },
211 | "glob": {
212 | "version": "7.1.2",
213 | "resolved": "http://registry.npm.taobao.org/glob/download/glob-7.1.2.tgz",
214 | "integrity": "sha1-wZyd+aAocC1nhhI4SmVSQExjbRU=",
215 | "requires": {
216 | "fs.realpath": "1.0.0",
217 | "inflight": "1.0.6",
218 | "inherits": "2.0.3",
219 | "minimatch": "3.0.4",
220 | "once": "1.4.0",
221 | "path-is-absolute": "1.0.1"
222 | }
223 | },
224 | "har-schema": {
225 | "version": "2.0.0",
226 | "resolved": "http://registry.npm.taobao.org/har-schema/download/har-schema-2.0.0.tgz",
227 | "integrity": "sha1-qUwiJOvKwEeCoNkDVSHyRzW37JI="
228 | },
229 | "har-validator": {
230 | "version": "5.0.3",
231 | "resolved": "http://registry.npm.taobao.org/har-validator/download/har-validator-5.0.3.tgz",
232 | "integrity": "sha1-ukAsJmGU8VlW7xXg/PJCmT9qff0=",
233 | "requires": {
234 | "ajv": "5.5.2",
235 | "har-schema": "2.0.0"
236 | }
237 | },
238 | "http-signature": {
239 | "version": "1.2.0",
240 | "resolved": "http://registry.npm.taobao.org/http-signature/download/http-signature-1.2.0.tgz",
241 | "integrity": "sha1-muzZJRFHcvPZW2WmCruPfBj7rOE=",
242 | "requires": {
243 | "assert-plus": "1.0.0",
244 | "jsprim": "1.4.1",
245 | "sshpk": "1.14.1"
246 | }
247 | },
248 | "inflight": {
249 | "version": "1.0.6",
250 | "resolved": "http://registry.npm.taobao.org/inflight/download/inflight-1.0.6.tgz",
251 | "integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=",
252 | "requires": {
253 | "once": "1.4.0",
254 | "wrappy": "1.0.2"
255 | }
256 | },
257 | "inherits": {
258 | "version": "2.0.3",
259 | "resolved": "http://registry.npm.taobao.org/inherits/download/inherits-2.0.3.tgz",
260 | "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4="
261 | },
262 | "is-typedarray": {
263 | "version": "1.0.0",
264 | "resolved": "http://registry.npm.taobao.org/is-typedarray/download/is-typedarray-1.0.0.tgz",
265 | "integrity": "sha1-5HnICFjfDBsR3dppQPlgEfzaSpo="
266 | },
267 | "isarray": {
268 | "version": "1.0.0",
269 | "resolved": "http://registry.npm.taobao.org/isarray/download/isarray-1.0.0.tgz",
270 | "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE="
271 | },
272 | "isstream": {
273 | "version": "0.1.2",
274 | "resolved": "http://registry.npm.taobao.org/isstream/download/isstream-0.1.2.tgz",
275 | "integrity": "sha1-R+Y/evVa+m+S4VAOaQ64uFKcCZo="
276 | },
277 | "jsbn": {
278 | "version": "0.1.1",
279 | "resolved": "http://registry.npm.taobao.org/jsbn/download/jsbn-0.1.1.tgz",
280 | "integrity": "sha1-peZUwuWi3rXyAdls77yoDA7y9RM=",
281 | "optional": true
282 | },
283 | "json-schema": {
284 | "version": "0.2.3",
285 | "resolved": "http://registry.npm.taobao.org/json-schema/download/json-schema-0.2.3.tgz",
286 | "integrity": "sha1-tIDIkuWaLwWVTOcnvT8qTogvnhM="
287 | },
288 | "json-schema-traverse": {
289 | "version": "0.3.1",
290 | "resolved": "http://registry.npm.taobao.org/json-schema-traverse/download/json-schema-traverse-0.3.1.tgz",
291 | "integrity": "sha1-NJptRMU6Ud6JtAgFxdXlm0F9M0A="
292 | },
293 | "json-stringify-safe": {
294 | "version": "5.0.1",
295 | "resolved": "http://registry.npm.taobao.org/json-stringify-safe/download/json-stringify-safe-5.0.1.tgz",
296 | "integrity": "sha1-Epai1Y/UXxmg9s4B1lcB4sc1tus="
297 | },
298 | "jsprim": {
299 | "version": "1.4.1",
300 | "resolved": "http://registry.npm.taobao.org/jsprim/download/jsprim-1.4.1.tgz",
301 | "integrity": "sha1-MT5mvB5cwG5Di8G3SZwuXFastqI=",
302 | "requires": {
303 | "assert-plus": "1.0.0",
304 | "extsprintf": "1.3.0",
305 | "json-schema": "0.2.3",
306 | "verror": "1.10.0"
307 | }
308 | },
309 | "lodash": {
310 | "version": "4.17.10",
311 | "resolved": "http://registry.npm.taobao.org/lodash/download/lodash-4.17.10.tgz",
312 | "integrity": "sha1-G3eTz3JZ6jj7NmHU04syYK+K5Oc="
313 | },
314 | "mime": {
315 | "version": "1.6.0",
316 | "resolved": "http://registry.npm.taobao.org/mime/download/mime-1.6.0.tgz",
317 | "integrity": "sha1-Ms2eXGRVO9WNGaVor0Uqz/BJgbE="
318 | },
319 | "mime-db": {
320 | "version": "1.33.0",
321 | "resolved": "http://registry.npm.taobao.org/mime-db/download/mime-db-1.33.0.tgz",
322 | "integrity": "sha1-o0kgUKXLm2NFBUHjnZeI0icng9s="
323 | },
324 | "mime-types": {
325 | "version": "2.1.18",
326 | "resolved": "http://registry.npm.taobao.org/mime-types/download/mime-types-2.1.18.tgz",
327 | "integrity": "sha1-bzI/YKg9ERRvgx/xH9ZuL+VQO7g=",
328 | "requires": {
329 | "mime-db": "1.33.0"
330 | }
331 | },
332 | "minimatch": {
333 | "version": "3.0.4",
334 | "resolved": "http://registry.npm.taobao.org/minimatch/download/minimatch-3.0.4.tgz",
335 | "integrity": "sha1-UWbihkV/AzBgZL5Ul+jbsMPTIIM=",
336 | "requires": {
337 | "brace-expansion": "1.1.11"
338 | }
339 | },
340 | "minimist": {
341 | "version": "0.0.8",
342 | "resolved": "http://registry.npm.taobao.org/minimist/download/minimist-0.0.8.tgz",
343 | "integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0="
344 | },
345 | "mkdirp": {
346 | "version": "0.5.0",
347 | "resolved": "http://registry.npm.taobao.org/mkdirp/download/mkdirp-0.5.0.tgz",
348 | "integrity": "sha1-HXMHam35hs2TROFecfzAWkyavxI=",
349 | "requires": {
350 | "minimist": "0.0.8"
351 | }
352 | },
353 | "moment": {
354 | "version": "2.22.2",
355 | "resolved": "http://registry.npm.taobao.org/moment/download/moment-2.22.2.tgz",
356 | "integrity": "sha1-PCV/mDn8DpP/UxSWMiOeuQeD/2Y="
357 | },
358 | "ms": {
359 | "version": "2.0.0",
360 | "resolved": "http://registry.npm.taobao.org/ms/download/ms-2.0.0.tgz",
361 | "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
362 | },
363 | "multiparty": {
364 | "version": "4.1.4",
365 | "resolved": "http://registry.npm.taobao.org/multiparty/download/multiparty-4.1.4.tgz",
366 | "integrity": "sha1-TJbcvcEeP3kX4WFeZAtLUCK+ZP0=",
367 | "requires": {
368 | "fd-slicer": "1.0.1",
369 | "safe-buffer": "5.1.2"
370 | }
371 | },
372 | "oauth-sign": {
373 | "version": "0.8.2",
374 | "resolved": "http://registry.npm.taobao.org/oauth-sign/download/oauth-sign-0.8.2.tgz",
375 | "integrity": "sha1-Rqarfwrq2N6unsBWV4C31O/rnUM="
376 | },
377 | "once": {
378 | "version": "1.4.0",
379 | "resolved": "http://registry.npm.taobao.org/once/download/once-1.4.0.tgz",
380 | "integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=",
381 | "requires": {
382 | "wrappy": "1.0.2"
383 | }
384 | },
385 | "path-is-absolute": {
386 | "version": "1.0.1",
387 | "resolved": "http://registry.npm.taobao.org/path-is-absolute/download/path-is-absolute-1.0.1.tgz",
388 | "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18="
389 | },
390 | "pend": {
391 | "version": "1.2.0",
392 | "resolved": "http://registry.npm.taobao.org/pend/download/pend-1.2.0.tgz",
393 | "integrity": "sha1-elfrVQpng/kRUzH89GY9XI4AelA="
394 | },
395 | "performance-now": {
396 | "version": "2.1.0",
397 | "resolved": "http://registry.npm.taobao.org/performance-now/download/performance-now-2.1.0.tgz",
398 | "integrity": "sha1-Ywn04OX6kT7BxpMHrjZLSzd8nns="
399 | },
400 | "process-nextick-args": {
401 | "version": "2.0.0",
402 | "resolved": "http://registry.npm.taobao.org/process-nextick-args/download/process-nextick-args-2.0.0.tgz",
403 | "integrity": "sha1-o31zL0JxtKsa0HDTVQjoKQeI/6o="
404 | },
405 | "progress": {
406 | "version": "2.0.0",
407 | "resolved": "http://registry.npm.taobao.org/progress/download/progress-2.0.0.tgz",
408 | "integrity": "sha1-ihvjZr+Pwj2yvSPxDG/pILQ4nR8="
409 | },
410 | "punycode": {
411 | "version": "1.4.1",
412 | "resolved": "http://registry.npm.taobao.org/punycode/download/punycode-1.4.1.tgz",
413 | "integrity": "sha1-wNWmOycYgArY4esPpSachN1BhF4="
414 | },
415 | "puppeteer": {
416 | "version": "0.9.0",
417 | "resolved": "http://registry.npm.taobao.org/puppeteer/download/puppeteer-0.9.0.tgz",
418 | "integrity": "sha1-1lmX/4PiTrVp5Vd9L3VpXcvlvko=",
419 | "requires": {
420 | "debug": "2.6.9",
421 | "extract-zip": "1.6.6",
422 | "mime": "1.6.0",
423 | "progress": "2.0.0",
424 | "rimraf": "2.6.2",
425 | "ws": "3.3.3"
426 | }
427 | },
428 | "qs": {
429 | "version": "6.5.2",
430 | "resolved": "http://registry.npm.taobao.org/qs/download/qs-6.5.2.tgz",
431 | "integrity": "sha1-yzroBuh0BERYTvFUzo7pjUA/PjY="
432 | },
433 | "readable-stream": {
434 | "version": "2.3.6",
435 | "resolved": "http://registry.npm.taobao.org/readable-stream/download/readable-stream-2.3.6.tgz",
436 | "integrity": "sha1-sRwn2IuP8fvgcGQ8+UsMea4bCq8=",
437 | "requires": {
438 | "core-util-is": "1.0.2",
439 | "inherits": "2.0.3",
440 | "isarray": "1.0.0",
441 | "process-nextick-args": "2.0.0",
442 | "safe-buffer": "5.1.2",
443 | "string_decoder": "1.1.1",
444 | "util-deprecate": "1.0.2"
445 | }
446 | },
447 | "request": {
448 | "version": "2.87.0",
449 | "resolved": "http://registry.npm.taobao.org/request/download/request-2.87.0.tgz",
450 | "integrity": "sha1-MvACNc0I1IK00NaNuTqCnA7VdW4=",
451 | "requires": {
452 | "aws-sign2": "0.7.0",
453 | "aws4": "1.7.0",
454 | "caseless": "0.12.0",
455 | "combined-stream": "1.0.6",
456 | "extend": "3.0.1",
457 | "forever-agent": "0.6.1",
458 | "form-data": "2.3.2",
459 | "har-validator": "5.0.3",
460 | "http-signature": "1.2.0",
461 | "is-typedarray": "1.0.0",
462 | "isstream": "0.1.2",
463 | "json-stringify-safe": "5.0.1",
464 | "mime-types": "2.1.18",
465 | "oauth-sign": "0.8.2",
466 | "performance-now": "2.1.0",
467 | "qs": "6.5.2",
468 | "safe-buffer": "5.1.2",
469 | "tough-cookie": "2.3.4",
470 | "tunnel-agent": "0.6.0",
471 | "uuid": "3.2.1"
472 | }
473 | },
474 | "request-promise": {
475 | "version": "4.2.2",
476 | "resolved": "http://registry.npm.taobao.org/request-promise/download/request-promise-4.2.2.tgz",
477 | "integrity": "sha1-0epG1lSm7k+O5qT+oQGMIpEZBLQ=",
478 | "requires": {
479 | "bluebird": "3.5.1",
480 | "request-promise-core": "1.1.1",
481 | "stealthy-require": "1.1.1",
482 | "tough-cookie": "2.3.4"
483 | }
484 | },
485 | "request-promise-core": {
486 | "version": "1.1.1",
487 | "resolved": "http://registry.npm.taobao.org/request-promise-core/download/request-promise-core-1.1.1.tgz",
488 | "integrity": "sha1-Pu4AssWqgyOc+wTFcA2jb4HNCLY=",
489 | "requires": {
490 | "lodash": "4.17.10"
491 | }
492 | },
493 | "rimraf": {
494 | "version": "2.6.2",
495 | "resolved": "http://registry.npm.taobao.org/rimraf/download/rimraf-2.6.2.tgz",
496 | "integrity": "sha1-LtgVDSShbqhlHm1u8PR8QVjOejY=",
497 | "requires": {
498 | "glob": "7.1.2"
499 | }
500 | },
501 | "safe-buffer": {
502 | "version": "5.1.2",
503 | "resolved": "http://registry.npm.taobao.org/safe-buffer/download/safe-buffer-5.1.2.tgz",
504 | "integrity": "sha1-mR7GnSluAxN0fVm9/St0XDX4go0="
505 | },
506 | "sshpk": {
507 | "version": "1.14.1",
508 | "resolved": "http://registry.npm.taobao.org/sshpk/download/sshpk-1.14.1.tgz",
509 | "integrity": "sha1-Ew9Zde3a2WPx1W+SuaxsUfqfg+s=",
510 | "requires": {
511 | "asn1": "0.2.3",
512 | "assert-plus": "1.0.0",
513 | "bcrypt-pbkdf": "1.0.1",
514 | "dashdash": "1.14.1",
515 | "ecc-jsbn": "0.1.1",
516 | "getpass": "0.1.7",
517 | "jsbn": "0.1.1",
518 | "tweetnacl": "0.14.5"
519 | }
520 | },
521 | "stealthy-require": {
522 | "version": "1.1.1",
523 | "resolved": "http://registry.npm.taobao.org/stealthy-require/download/stealthy-require-1.1.1.tgz",
524 | "integrity": "sha1-NbCYdbT/SfJqd35QmzCQoyJr8ks="
525 | },
526 | "string_decoder": {
527 | "version": "1.1.1",
528 | "resolved": "http://registry.npm.taobao.org/string_decoder/download/string_decoder-1.1.1.tgz",
529 | "integrity": "sha1-nPFhG6YmhdcDCunkujQUnDrwP8g=",
530 | "requires": {
531 | "safe-buffer": "5.1.2"
532 | }
533 | },
534 | "tough-cookie": {
535 | "version": "2.3.4",
536 | "resolved": "http://registry.npm.taobao.org/tough-cookie/download/tough-cookie-2.3.4.tgz",
537 | "integrity": "sha1-7GDO44rGdQY//JelwYlwV47oNlU=",
538 | "requires": {
539 | "punycode": "1.4.1"
540 | }
541 | },
542 | "tunnel-agent": {
543 | "version": "0.6.0",
544 | "resolved": "http://registry.npm.taobao.org/tunnel-agent/download/tunnel-agent-0.6.0.tgz",
545 | "integrity": "sha1-J6XeoGs2sEoKmWZ3SykIaPD8QP0=",
546 | "requires": {
547 | "safe-buffer": "5.1.2"
548 | }
549 | },
550 | "tweetnacl": {
551 | "version": "0.14.5",
552 | "resolved": "http://registry.npm.taobao.org/tweetnacl/download/tweetnacl-0.14.5.tgz",
553 | "integrity": "sha1-WuaBd/GS1EViadEIr6k/+HQ/T2Q=",
554 | "optional": true
555 | },
556 | "typedarray": {
557 | "version": "0.0.6",
558 | "resolved": "http://registry.npm.taobao.org/typedarray/download/typedarray-0.0.6.tgz",
559 | "integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c="
560 | },
561 | "ultron": {
562 | "version": "1.1.1",
563 | "resolved": "http://registry.npm.taobao.org/ultron/download/ultron-1.1.1.tgz",
564 | "integrity": "sha1-n+FTahCmZKZSZqHjzPhf02MCvJw="
565 | },
566 | "util-deprecate": {
567 | "version": "1.0.2",
568 | "resolved": "http://registry.npm.taobao.org/util-deprecate/download/util-deprecate-1.0.2.tgz",
569 | "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8="
570 | },
571 | "uuid": {
572 | "version": "3.2.1",
573 | "resolved": "http://registry.npm.taobao.org/uuid/download/uuid-3.2.1.tgz",
574 | "integrity": "sha1-EsUou51Y0LkmXZovbw/ovhf/HxQ="
575 | },
576 | "verror": {
577 | "version": "1.10.0",
578 | "resolved": "http://registry.npm.taobao.org/verror/download/verror-1.10.0.tgz",
579 | "integrity": "sha1-OhBcoXBTr1XW4nDB+CiGguGNpAA=",
580 | "requires": {
581 | "assert-plus": "1.0.0",
582 | "core-util-is": "1.0.2",
583 | "extsprintf": "1.3.0"
584 | }
585 | },
586 | "wrappy": {
587 | "version": "1.0.2",
588 | "resolved": "http://registry.npm.taobao.org/wrappy/download/wrappy-1.0.2.tgz",
589 | "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8="
590 | },
591 | "ws": {
592 | "version": "3.3.3",
593 | "resolved": "http://registry.npm.taobao.org/ws/download/ws-3.3.3.tgz",
594 | "integrity": "sha1-8c+E/i1ekB686U767OeF8YeiKPI=",
595 | "requires": {
596 | "async-limiter": "1.0.0",
597 | "safe-buffer": "5.1.2",
598 | "ultron": "1.1.1"
599 | }
600 | },
601 | "yauzl": {
602 | "version": "2.4.1",
603 | "resolved": "http://registry.npm.taobao.org/yauzl/download/yauzl-2.4.1.tgz",
604 | "integrity": "sha1-lSj0QtqxsihOWLQ3m7GU4i4MQAU=",
605 | "requires": {
606 | "fd-slicer": "1.0.1"
607 | }
608 | }
609 | }
610 | }
611 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "eva",
3 | "version": "1.0.0",
4 | "description": "",
5 | "main": "index.js",
6 | "scripts": {
7 | "test": "node ./src/tools/tools.js",
8 | "start": "node ./src/index.js",
9 | "req": "node ./src/req.js",
10 | "file": "node ./src/tools/file-upload.js",
11 | "img": "node ./src/tools/img-sina.js"
12 | },
13 | "author": "",
14 | "license": "ISC",
15 | "dependencies": {
16 | "moment": "^2.22.2",
17 | "multiparty": "^4.1.4",
18 | "puppeteer": "0.9.0",
19 | "request": "^2.87.0",
20 | "request-promise": "^4.2.2"
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/src/config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | key: 'js中文分词',
3 | company: '互联网'
4 | }
--------------------------------------------------------------------------------
/src/data-source/baidu.js:
--------------------------------------------------------------------------------
1 | var fs = require("fs");
2 |
3 | // 搜索源
4 | const source = 'baidu'
5 |
6 | // 路径
7 | const url = 'https://www.baidu.com'
8 |
9 | // 搜索按钮,选择器
10 | const submitSelectName = '#su'
11 |
12 | // 页面内容,选择器
13 | const domSelectName = '.c-container'
14 |
15 | // 下一页,选择器
16 | const nextPageSelectName = '#page > a:last-child'
17 |
18 | // 翻页数量
19 | const needPageMaxNum = 20
20 |
21 | module.exports = async(browser, timeout, key) => {
22 | require('./template/search-engine.js')(
23 | browser, timeout, key,
24 | source, url, submitSelectName, domSelectName,
25 | nextPageSelectName, needPageMaxNum
26 | )
27 | }
28 |
--------------------------------------------------------------------------------
/src/data-source/bing.js:
--------------------------------------------------------------------------------
1 | var fs = require("fs");
2 |
3 | const source = 'bing'
4 | const url = 'https://cn.bing.com/'
5 | const submitSelectName = '#sb_form_go'
6 | const domSelectName = '.b_algo'
7 | const nextPageSelectName = '[title="下一页"]'
8 | const needPageMaxNum = 20
9 |
10 | module.exports = async(browser, timeout, key) => {
11 | require('./template/search-engine.js') (
12 | browser, timeout, key,
13 | source, url, submitSelectName, domSelectName,
14 | nextPageSelectName, needPageMaxNum
15 | )
16 | }
--------------------------------------------------------------------------------
/src/data-source/so.js:
--------------------------------------------------------------------------------
1 | var fs = require("fs");
2 |
3 | // 搜索源
4 | const source = 'so'
5 |
6 | // 路径
7 | const url = 'https://www.so.com'
8 |
9 | // 搜索按钮,选择器
10 | const submitSelectName = '#search-button'
11 |
12 | // 页面内容,选择器
13 | const domSelectName = '.res-list'
14 |
15 | // 下一页,选择器
16 | const nextPageSelectName = '#snext'
17 |
18 | // 翻页数量
19 | const needPageMaxNum = 20
20 |
21 | module.exports = async(browser, timeout, key) => {
22 | require('./template/search-engine.js')(
23 | browser, timeout, key,
24 | source, url, submitSelectName, domSelectName,
25 | nextPageSelectName, needPageMaxNum
26 | )
27 | }
28 |
--------------------------------------------------------------------------------
/src/data-source/sogou.js:
--------------------------------------------------------------------------------
1 | var fs = require("fs");
2 |
3 | // 搜索源
4 | const source = 'sogou'
5 |
6 | // 路径
7 | const url = 'https://www.sogou.com/'
8 |
9 | // 搜索按钮,选择器
10 | const submitSelectName = '#stb'
11 |
12 | // 页面内容,选择器
13 | const domSelectName = '.vrwrap'
14 |
15 | // 下一页,选择器
16 | const nextPageSelectName = '#sogou_next'
17 |
18 | // 翻页数量
19 | const needPageMaxNum = 20
20 |
21 | module.exports = async(browser, timeout, key) => {
22 | require('./template/search-engine.js')(
23 | browser, timeout, key,
24 | source, url, submitSelectName, domSelectName,
25 | nextPageSelectName, needPageMaxNum
26 | )
27 | }
28 |
--------------------------------------------------------------------------------
/src/data-source/template/search-engine.js:
--------------------------------------------------------------------------------
1 | var fs = require("fs");
2 |
3 | module.exports = async (
4 | browser, timeout, key,
5 | source, url, submitSelectName, domSelectName,
6 | nextPageSelectName, needPageMaxNum
7 | ) => {
8 | // 从dom获取数据,并写文件
9 | var getDataFromDom = async () => {
10 | await timeout(1500);
11 | var data = await page.evaluate((domSelectName) => {
12 | var list = [...document.querySelectorAll(domSelectName)]
13 |
14 | return list.map(el => {
15 | return { html: el.innerHTML, content: el.innerText }
16 | })
17 | }, domSelectName)
18 |
19 | var content = []
20 | data.forEach(element => {
21 | content.push(element.content)
22 | console.log(content)
23 | });
24 |
25 | await fs.appendFileSync(`./src/data/${key}-${source}.txt`, `startTime: ${new Date().toUTCString()}`+'\r');
26 | await fs.appendFileSync(`./src/data/${key}-${source}.txt`, JSON.stringify(content, null , ' ')+'\r');
27 | await fs.appendFileSync(`./src/data/${key}-${source}.txt`, `endTime: ${new Date().toUTCString()}`+'\r\r');
28 | }
29 |
30 | // 1.跳转至相应的页面
31 | var page = await browser.newPage();
32 | await page.goto(url);
33 |
34 | // 2.输入关键字
35 | await timeout(3000* Math.random());
36 | await page.type(key, { delay: 100 })
37 |
38 | // 3.点击提交
39 | var submit = await page.$(submitSelectName)
40 | await submit.click()
41 |
42 | // 4.获取数据,数据写文件
43 | await getDataFromDom()
44 |
45 | // 6.翻页,数据写文件
46 | for (let index = 0; index < needPageMaxNum; index++) {
47 | var nextPage = await page.$(nextPageSelectName)
48 | await nextPage.click()
49 | await timeout(3000* Math.random());
50 | await getDataFromDom()
51 | }
52 |
53 | // 7.关闭页面
54 | await timeout(3000* Math.random());
55 | await page.close()
56 | }
--------------------------------------------------------------------------------
/src/data-source/tianyancha.js:
--------------------------------------------------------------------------------
1 | var fs = require("fs");
2 |
3 | module.exports = async (browser, timeout, key) => {
4 | var getDataFromDom = async () => {
5 | await timeout(1500);
6 | var data = await page.evaluate(() => {
7 | var list = [...document.querySelectorAll('.search_result_single')]
8 |
9 | return list.map(el => {
10 | return { html: el.innerHTML, content: el.innerText }
11 | })
12 | })
13 | var content = []
14 | data.forEach(element => {
15 | content.push(element.content)
16 | console.log(content)
17 | });
18 |
19 | await fs.appendFileSync(`./src/data/tianyancha-${key}.txt`, `startTime: ${new Date().toUTCString()}`+'\r');
20 | await fs.appendFileSync(`./src/data/tianyancha-${key}.txt`, JSON.stringify(content, null , ' ')+'\r');
21 | await fs.appendFileSync(`./src/data/tianyancha-${key}.txt`, `endTime: ${new Date().toUTCString()}`+'\r\r');
22 | }
23 |
24 | var page = await browser.newPage();
25 | await page.goto(`https://www.tianyancha.com/search?key=${key}`);
26 | await timeout(500);
27 | await getDataFromDom()
28 |
29 | for (let index = 0; index < 10000000; index++) {
30 | var nextPage = await page.$('#web-content > div > div > div > div.col-9.search-2017-2.pr15.pl0 > div.b-c-white.clearfix.position-rel.mb30 > div > div.search_pager.human_pager.in-block > ul > li.pagination-next.ng-scope > a')
31 |
32 | await nextPage.click()
33 | await timeout(6 * 1000 * Math.random());
34 | await getDataFromDom()
35 | }
36 |
37 | await page.close()
38 | }
--------------------------------------------------------------------------------
/src/duanzi/budejie-detail.js:
--------------------------------------------------------------------------------
1 | var origin_key = 'b61d91ea6c3'
2 | var crypto = require('crypto')
3 | var sha1 = crypto.createHash('sha1');
4 | var moment = require('moment')
5 | var str = moment().format('YYYY-MM-DD 00:00:00') + origin_key
6 | sha1.update(str)
7 | var keyCode = sha1.digest('hex')
8 | keyCode = keyCode.substr(-15)
9 | var fs = require("fs");
10 | var req = require('request-promise');
11 | var common = require('./common.js')
12 |
13 | var theUrl = 'http://www.budejie.com'
14 |
15 | var fromName = '不得姐'
16 | var fromFileName = 'budejie'
17 |
18 | module.exports = async (browser, timeout, key) => {
19 | var getDataFromDom = async () => {
20 | await timeout(1500);
21 |
22 | var data = await page.evaluate(() => {
23 | let discuss = []
24 | document.querySelectorAll('#hotCommentList').forEach(el => {
25 | if (el.querySelector('.g-mnc1') && el.querySelector('.g-mnc1').innerText) {
26 | let txt = el.querySelector('.g-mnc1').innerText
27 | discuss.push(txt.trim())
28 | }
29 | })
30 |
31 | return {
32 | title: null,
33 | content: document.querySelector('.j-r-list-c-desc') ? document.querySelector('.j-r-list-c-desc').innerText : null,
34 | discuss: JSON.stringify(discuss),
35 | imgurl: document.querySelector('.j-r-list-c-img > img') ? document.querySelector('.j-r-list-c-img > img').src : null,
36 | cdn_img_url: null,
37 | zan: document.querySelector('.j-r-list-tool-l-up').innerText.trim(),
38 | comments: document.querySelector('.comment-counts').innerText.trim(),
39 | type: null
40 | }
41 | })
42 |
43 | data = [data]
44 | // 上传图片
45 | // data = await common.uploadImg(data)
46 |
47 | // 写文件
48 | await common.wirteFile(data, fromFileName)
49 |
50 | // 上传到后台
51 | await common.uploadData(fromName, data, firstUrl)
52 | }
53 |
54 | var page = await browser.newPage();
55 | await page.goto(theUrl)
56 | await timeout(2000)
57 |
58 | var firstUrl = await page.evaluate(() => {
59 | var url = document.querySelector('.j-r-list-c-img > a').href;
60 | return url
61 | })
62 | console.log('firstUrl:', firstUrl)
63 |
64 | await page.goto(firstUrl);
65 | await timeout(3000);
66 | await getDataFromDom();
67 |
68 | for (let index = 0; index < 10000000; index++) {
69 | var nextPage = await page.$('.c-next-btn')
70 |
71 | await nextPage.click()
72 | await timeout(6 * 1000 * Math.random());
73 | await getDataFromDom()
74 | }
75 | await page.close()
76 | }
--------------------------------------------------------------------------------
/src/duanzi/common.js:
--------------------------------------------------------------------------------
1 | var fs = require("fs");
2 | var req = require('request-promise');
3 | var origin_key = 'b61d91ea6c3'
4 | var crypto = require('crypto')
5 | var sha1 = crypto.createHash('sha1');
6 | var moment = require('moment')
7 | var str = moment().format('YYYY-MM-DD 00:00:00') + origin_key
8 | sha1.update(str)
9 | var keyCode = sha1.digest('hex')
10 | keyCode = keyCode.substr(-15)
11 |
12 | var proxyPool = [
13 | 'http://119.27.177.169:80',
14 | // 'http://110.73.7.162:8123',
15 | // 'http://119.28.203.242:9000',
16 | // 'http://222.88.149.32:8060',
17 | // 'http://118.190.95.43:9001',
18 | // 'http://144.0.111.107:8060',
19 | // 'http://120.131.9.254:1080',
20 | // 'http://121.18.231.74:80',
21 | // 'http://39.137.69.9:80',
22 | // 'http://103.242.219.242:8080',
23 | // 'http://119.28.194.66:8888',
24 | ]
25 |
26 | module.exports = {
27 | // 上传图片文件
28 | uploadImg: async (data) => {
29 | for (let i = 0; i < data.length; i++) {
30 |
31 | let item = data[i]
32 | if (!item.imgurl) {
33 | continue
34 | }
35 |
36 | if (0) {
37 | // 小贱图床,(实际新浪)
38 | try {
39 | console.log('上传 小贱-图片')
40 | let imgurl = item.imgurl
41 | let type = imgurl.split('.').pop()
42 | let response = await req({
43 | url: `https://pic.xiaojianjian.net/webtools/picbed/uploadByUrl.htm?url=${imgurl}`
44 | })
45 | console.log('response:', response)
46 |
47 | response = JSON.parse(response)
48 | let cdn_img_url = response.original_pic
49 | item.cdn_img_url = cdn_img_url
50 | } catch (error) { }
51 | } else {
52 | // yum6图床(实际新浪)
53 | try {
54 | console.log('上传 yum6-图片')
55 | let imgurl = item.imgurl
56 | let type = imgurl.split('.').pop()
57 | let rdProxy = Math.floor(Math.random() * proxyPool.length)
58 | console.log(proxyPool[rdProxy])
59 |
60 | let response = await req({
61 | url: 'https://api.yum6.cn/sinaimg.php?img=' + imgurl,
62 | // proxy: proxyPool[rdProxy]
63 | })
64 | console.log('response:', response)
65 |
66 | response = JSON.parse(response)
67 | let cdn_img_url = 'https://ww2.sinaimg.cn/large/' + response.pid + '.' + type
68 | item.cdn_img_url = cdn_img_url
69 | } catch (error) {
70 | console.log(i, error)
71 | }
72 | }
73 |
74 | }
75 | return data
76 | },
77 | // 写文件
78 | wirteFile: async (data, fromFileName) => {
79 | try {
80 | console.log('写文件')
81 | await fs.appendFileSync(`./src/data/${fromFileName}.txt`, JSON.stringify(data[0], null, ' ') + '\r');
82 | } catch (error) {
83 | console.log('err:', error)
84 | }
85 | },
86 | // 上传到后台
87 | uploadData: async (fromName, data, url) => {
88 | var options = {
89 | method: 'POST',
90 | timeout: 3000000,
91 | uri: 'https://juhe.qqeasy.com/information/import-jokes',
92 | body: {
93 | "key": keyCode,
94 | "from": fromName,
95 | "from_url": url,
96 | "create_time": new Date().toUTCString(),
97 | "data": {
98 | "contents": data
99 | }
100 | },
101 | json: true
102 | }
103 |
104 | try {
105 | console.log('上传到后台')
106 | let response = await req(options)
107 | } catch (error) {
108 | console.log('err:', error)
109 | }
110 | }
111 | }
--------------------------------------------------------------------------------
/src/duanzi/gaoxiaogif-detail.js:
--------------------------------------------------------------------------------
1 | var origin_key = 'b61d91ea6c3'
2 | var crypto = require('crypto')
3 | var sha1 = crypto.createHash('sha1');
4 | var moment = require('moment')
5 | var str = moment().format('YYYY-MM-DD 00:00:00') + origin_key
6 | sha1.update(str)
7 | var keyCode = sha1.digest('hex')
8 | keyCode = keyCode.substr(-15)
9 | var fs = require("fs");
10 | var req = require('request-promise');
11 | var common = require('./common.js')
12 |
13 | var theUrl = 'http://www.gaoxiaogif.cn'
14 | var fromName = '搞笑动图'
15 | var fromFileName = 'gaoxiaogif'
16 |
17 | module.exports = async (browser, timeout, key) => {
18 | var getDataFromDom = async () => {
19 | await timeout(1500);
20 |
21 | var data = await page.evaluate(() => {
22 | let discuss = []
23 | document.querySelectorAll('.box-s li').forEach(el => {
24 | if (el.querySelector('.content') && el.querySelector('.content').innerText) {
25 | discuss.push(el.querySelector('.content').innerText.split(':')[1])
26 | }
27 | })
28 |
29 | return {
30 | title: null,
31 | content: document.querySelector('.showtxt') ? document.querySelector('.showtxt').innerText : null,
32 | discuss: JSON.stringify(discuss),
33 | imgurl: document.querySelector('.imgp img') ? document.querySelector('.imgp img').src : null,
34 | cdn_img_url: null,
35 | zan: document.querySelector('.up').innerText,
36 | comments: null,
37 | type: null
38 | }
39 | })
40 |
41 | data = [data]
42 | // 上传图片
43 | // data = await common.uploadImg(data)
44 |
45 | // 写文件
46 | await common.wirteFile(data, fromFileName)
47 |
48 | // 上传到后台
49 | console.log('firstUrl2:', firstUrl)
50 | await common.uploadData(fromName, data, firstUrl)
51 | }
52 |
53 | var page = await browser.newPage();
54 | await page.goto(theUrl)
55 | await timeout(2000)
56 |
57 | var firstUrl = await page.evaluate(() => {
58 | var url = document.querySelector('.img > a').href;
59 | return url
60 | })
61 | console.log('firstUrl:', firstUrl)
62 |
63 | await page.goto(firstUrl);
64 | await timeout(2000);
65 | await getDataFromDom()
66 |
67 | for (let index = 0; index < 10000000; index++) {
68 | var nextPage = await page.$('.fr a')
69 |
70 | await nextPage.click()
71 | await timeout(6 * 1000 * Math.random());
72 | await getDataFromDom()
73 | }
74 | await page.close()
75 | }
--------------------------------------------------------------------------------
/src/duanzi/qiubai-detail.js:
--------------------------------------------------------------------------------
1 | var fs = require("fs");
2 | var req = require('request-promise');
3 | var origin_key = 'b61d91ea6c3'
4 | var crypto = require('crypto')
5 | var sha1 = crypto.createHash('sha1');
6 | var moment = require('moment')
7 | var str = moment().format('YYYY-MM-DD 00:00:00') + origin_key
8 | sha1.update(str)
9 | var keyCode = sha1.digest('hex')
10 | keyCode = keyCode.substr(-15)
11 | var common = require('./common.js')
12 |
13 | var theUrl = 'https://www.qiushibaike.com'
14 | var fromName = '糗百'
15 | var fromFileName = 'qiubai-detail'
16 |
17 | module.exports = async (browser, timeout, key) => {
18 | // 从DOM爬取数据
19 | var getDataFromDom = async () => {
20 | await timeout(1500);
21 |
22 | var data = await page.evaluate(() => {
23 | var discuss = []
24 | document.querySelectorAll('.comments-table .main-text').forEach(el => {
25 | discuss.push(el.innerText)
26 | })
27 |
28 | return {
29 | title: null,
30 | content: document.querySelector('.content') ? document.querySelector('.content').innerText : null,
31 | discuss: JSON.stringify(discuss),
32 | imgurl: document.querySelector('.thumb img') ? document.querySelector('.thumb img').src : null,
33 | cdn_img_url: null,
34 | zan: document.querySelector('.stats-vote .number').innerText,
35 | comments: document.querySelector('.stats-comments .number').innerText,
36 | type: null
37 | }
38 | })
39 | data = [data]
40 |
41 | console.log(data)
42 | // 上传图片
43 | // data = await common.uploadImg(data)
44 |
45 | // 写文件
46 | await common.wirteFile(data, fromFileName)
47 |
48 | // 上传到后台
49 | console.log('firstUrl2:', firstUrl)
50 | await common.uploadData(fromName, data, firstUrl)
51 | }
52 |
53 | var page = await browser.newPage();
54 | await page.goto(theUrl)
55 | await timeout(2000)
56 |
57 | var firstUrl = await page.evaluate(() => {
58 | var url = document.querySelector('.contentHerf').href;
59 | return url
60 | })
61 | console.log('firstUrl:', firstUrl)
62 |
63 | await page.goto(firstUrl);
64 | await timeout(2000);
65 | await getDataFromDom()
66 |
67 | for (let index = 0; index < 10000000; index++) {
68 | var nextPage = await page.$('.page-nav-list-next')
69 |
70 | await nextPage.click()
71 | await timeout(6 * 1000 * Math.random());
72 | await getDataFromDom()
73 | }
74 | await page.close()
75 | }
--------------------------------------------------------------------------------
/src/img/5b3f553ed9fb7.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhentaoo/eva/fc06ca95716329d96c8036b445f4f10643feb771/src/img/5b3f553ed9fb7.gif
--------------------------------------------------------------------------------
/src/index.js:
--------------------------------------------------------------------------------
1 | const puppeteer = require('puppeteer');
2 | var { timeout } = require('./tools/tools.js');
3 | var key = require('./config.js').key
4 | var company = require('./config.js').company
5 |
// Entry point: launches a visible browser and runs the enabled scraper. Exactly one
// scraper line below is active; the others are kept commented out as a switchboard.
// For a headless run use: puppeteer.launch().then(async (browser) => {
puppeteer.launch({ headless: false }).then(async (browser) => {
  console.log('startTime:', new Date().toUTCString());
  try {
    /**
     * Search engines
     */
    // await require('./data-source/so.js')(browser, timeout, key)
    // await require('./data-source/sogou.js')(browser, timeout, key)
    // await require('./data-source/baidu.js')(browser, timeout, key)
    // await require('./data-source/bing.js')(browser, timeout, key)

    /**
     * Jokes
     */
    // await require('./duanzi/qiubai-detail.js')(browser, timeout, company)
    // await require('./duanzi/gaoxiaogif-detail.js')(browser, timeout, company)
    await require('./duanzi/budejie-detail.js')(browser, timeout, company)

    /**
     * Company information
     */
    // await require('./data-source/tianyancha.js')(browser, timeout, company)
  } catch (error) {
    console.log(error)
  } finally {
    // Log the end time exactly once, then close the browser so the Node process can exit.
    console.log('endTime:', new Date().toUTCString());
    await browser.close();
  }
}).catch((error) => {
  // Covers a failed launch as well as a failed browser.close().
  console.log(error)
});
35 |
--------------------------------------------------------------------------------
/src/req.js:
--------------------------------------------------------------------------------
1 | var fs = require("fs");
2 | var req = require('request-promise');
3 | var url = 'https://www.pengfu.com/'
4 | var origin_key = 'b61d91ea6c3'
5 | var crypto = require('crypto')
6 | var sha1 = crypto.createHash('sha1');
7 | var moment = require('moment')
8 |
/**
 * Manual smoke test for the import endpoint: derives today's key and POSTs two
 * hard-coded sample jokes, logging the intermediate key values and the response.
 *
 * @param {*} params - not used.
 * @returns {Promise<void>} rejects when the request fails.
 */
var fn = async function (params) {
  var str = moment().format('YYYY-MM-DD 00:00:00') + origin_key
  console.log('str:', str)
  // Use a fresh hash object per call: a Hash cannot be reused after digest(),
  // so a shared module-level instance would throw on the second call.
  var keyCode = crypto.createHash('sha1').update(str).digest('hex')
  console.log('keyCode:', keyCode)
  // Only the last 15 hex characters are sent as the key.
  keyCode = keyCode.slice(-15)
  console.log('keyCode15:', keyCode)

  var res = await req({
    method: 'POST',
    // https, matching the uploader in duanzi/common.js, so the key is not sent in clear text.
    uri: 'https://juhe.qqeasy.com/information/import-jokes',
    body: {
      "key": keyCode,
      "from": url,
      "from_url": url,
      "create_time": new Date().toUTCString(),
      "data": {
        "contents": [
          {
            "title": "三地鼠使用了惊吓,你的防御下降了",
            "content": "",
            "img": "https://image7.pengfu.com/origin/180601/5b11467493589.gif"
          },
          {
            "title": "幸福送给别人,悲伤留给自己",
            "content": "女同学结婚,邀请我做伴娘。我深感荣幸,婚宴上喝得不亦乐乎。另一位女同学趁着酒意告诉我:新娘A杯,怕婚礼上被抢风头,为选伴娘伤透脑筋。终于大家一致推荐了你,因为没有最小只有更小!",
            "img": null
          }
        ]
      }
    },
    json: true // send the body as JSON and parse the JSON response
  })

  console.log(res)
}

// Report a failed request instead of leaving the rejection unhandled.
fn().catch(function (err) {
  console.log('err:', err)
})
--------------------------------------------------------------------------------
/src/tmp/pengfu.js:
--------------------------------------------------------------------------------
1 | var fs = require("fs");
2 | var req = require('request-promise');
3 | var url = 'https://www.pengfu.com/qutu_18.html'
4 |
5 | var origin_key = 'b61d91ea6c3'
6 | var crypto = require('crypto')
7 | var sha1 = crypto.createHash('sha1');
8 | var moment = require('moment')
9 |
10 | var url = 'https://www.pengfu.com/qutu_18.html'
11 | var fromName = '捧腹'
12 | var fromFileName = 'pengfu'
13 |
14 | var str = moment().format('YYYY-MM-DD 00:00:00') + origin_key
15 | sha1.update(str)
16 | var keyCode = sha1.digest('hex')
17 | keyCode = keyCode.substr(-15)
18 |
19 | module.exports = async (browser, timeout, key) => {
20 | var getDataFromDom = async () => {
21 | await timeout(1500);
22 | var data = await page.evaluate(() => {
23 | var list = [...document.querySelectorAll('.list-item')]
24 |
25 | return list.map(el => {
26 | return {
27 | title: el.querySelector('.dp-b').innerText,
28 | content: el.querySelector('.content-img').innerText,
29 | imgurl: el.querySelector('.content-img > img') ? (el.querySelector('.content-img > img').getAttribute('gifsrc') || el.querySelector('.content-img > img').src) : null,
30 | cdn_img_url: null,
31 | zan: el.querySelector('.fl .ding').innerText,
32 | comments: el.querySelector('.fl .commentClick').innerText,
33 | type: el.querySelectorAll('div.fr > a')
34 | && ([...el.querySelectorAll('div.fr > a')].map(i => { return i.innerText }).join(','))
35 | }
36 | })
37 | })
38 | console.log('data:', JSON.stringify(data))
39 |
40 | // 上传图片
41 | // data = await common.uploadImg(data)
42 |
43 | // 写文件
44 | await common.wirteFile(data, fromFileName)
45 |
46 | // 上传到后台
47 | await common.uploadData(fromName, data, url)
48 | }
49 |
50 | var page = await browser.newPage();
51 | await page.goto(url);
52 | await timeout(500);
53 | await getDataFromDom()
54 |
55 | for (let index = 0; index < 10000000; index++) {
56 | var nextPage = await page.$('.page > div > a:last-child')
57 |
58 | await nextPage.click()
59 | await timeout(6 * 1000 * Math.random());
60 | await getDataFromDom()
61 | }
62 |
63 | await page.close()
64 | }
--------------------------------------------------------------------------------
/src/tools/file-upload.js:
--------------------------------------------------------------------------------
1 | var req = require('request-promise');
2 | var fs = require('fs')
3 | var multiparty = require('multiparty');
4 |
/**
 * Downloads `url` into the local file `fileUrl`.
 *
 * @param {string} url - remote resource to fetch.
 * @param {string} fileUrl - local path to write to.
 * @returns {Promise<void>} resolves once the file has been completely written;
 *   rejects on a request, HTTP or file-system error.
 */
function downloadImg(url, fileUrl) {
  return new Promise((resolve, reject) => {
    const out = fs.createWriteStream(fileUrl);
    // 'finish' fires after all data has been flushed, so callers that await this
    // function can safely read the file afterwards.
    out.on('finish', () => resolve());
    out.on('error', reject);

    const download = req(url);
    download.on('error', reject);
    // Also observe the request's own promise so a failed download rejects this
    // one instead of surfacing as an unhandled rejection.
    download.catch(reject);
    download.pipe(out);
  });
}
9 |
/**
 * Uploads the local file `fileUrl` to `cdnUrl` as a multipart form POST and logs
 * the response body. A failed upload is logged, not thrown.
 *
 * @param {string} fileName - file name reported in the form data.
 * @param {string} fileUrl - local path of the file to upload.
 * @param {string} cdnUrl - upload endpoint.
 * @returns {Promise<void>} settles after the upload has finished or failed.
 */
async function uploadImg(fileName, fileUrl, cdnUrl) {
  console.log(fileUrl)
  console.log(cdnUrl)

  // No explicit content-type header: a urlencoded type would contradict the
  // multipart body that `formData` produces.
  // NOTE(review): request is expected to set the multipart header itself — confirm.
  const options = {
    method: 'POST',
    uri: cdnUrl,
    formData: {
      name: 'smfile',
      file: {
        value: fs.createReadStream(fileUrl),
        options: {
          filename: fileName,
          contentType: 'image/jpg'
        }
      },
      filename: fileName
    }
  };

  // Await the request so callers that await uploadImg really wait for the upload.
  try {
    const body = await req(options);
    console.log('body:', body)
  } catch (err) {
    console.log('err:', err)
  }
}
45 |
/**
 * Downloads the image at `url` into ./src/img/ and then uploads that file to sm.ms.
 *
 * @param {string} url - image URL; the text after its last '/' becomes the file name.
 * @returns {Promise<void>}
 */
async function uploadCdnImg(url) {
  const fileName = url.split('/').pop()
  const fileUrl = './src/img/' + fileName
  const cdnUrl = 'https://sm.ms/api/upload'

  await downloadImg(url, fileUrl)

  await uploadImg(fileName, fileUrl, cdnUrl)
}

// Run once on load with a sample image; report failures instead of leaving the
// rejection unhandled.
uploadCdnImg('https://image7.pengfu.com/origin/180706/5b3f553ed9fb7.gif')
  .catch(function (err) {
    console.log('err:', err)
  })
--------------------------------------------------------------------------------
/src/tools/img-sina.js:
--------------------------------------------------------------------------------
1 | var req = require('request-promise');
2 | var fs = require('fs')
3 |
/**
 * Manual check of the yum6 re-hosting API: submits one sample image URL, then logs
 * the parsed response and the sinaimg URL built from `response.pid`.
 *
 * @returns {Promise<void>} rejects when the request fails or the response is not JSON.
 */
async function a () {
  const imgurl = 'https://image7.pengfu.com/origin/180727/5b5adaf912540.jpg'
  // File extension taken from the text after the last dot of the source URL.
  const type = imgurl.split('.').pop()

  // Encode the source URL so it is passed as a single query value.
  let response = await req('https://api.yum6.cn/sinaimg.php?img=' + encodeURIComponent(imgurl))
  response = JSON.parse(response)
  const cdn_img_url = 'https://ww2.sinaimg.cn/large/' + response.pid + '.' + type

  console.log(response)
  console.log(cdn_img_url)
}

// Report failures instead of leaving the rejection unhandled.
a().catch(function (err) {
  console.log('err:', err)
})
--------------------------------------------------------------------------------
/src/tools/tools.js:
--------------------------------------------------------------------------------
1 | var fs = require("fs");
2 |
/**
 * Small static helpers shared by the scrapers.
 */
class Tools {
  /**
   * Synchronously appends `data` to the file at `path`.
   *
   * @param {string} path - file to append to.
   * @param {string|Buffer} data - content to append.
   * @param {function|object|string} [callback] - when a function, it is called with
   *   no arguments after a successful write; any other value is forwarded to
   *   fs.appendFileSync as its options argument.
   * @throws whatever fs.appendFileSync throws (e.g. when the directory is missing).
   */
  static appendFileSync(path, data, callback) {
    if (typeof callback === 'function') {
      // fs.appendFileSync has no callback parameter; a function in the options
      // slot is ignored, so call it explicitly once the write has completed.
      fs.appendFileSync(path, data);
      callback();
      return;
    }
    fs.appendFileSync(path, data, callback);
  }

  /**
   * Promise-based sleep.
   *
   * @param {number} delay - time to wait, in milliseconds.
   * @returns {Promise<number>} resolves with 1 after `delay` ms; never rejects.
   */
  static timeout(delay) {
    return new Promise((resolve) => {
      setTimeout(() => resolve(1), delay);
    });
  }

  /**
   * Minimal date formatter: replaces the FIRST occurrence of each placeholder in
   * `formatStr` with the corresponding local-time value (no zero padding).
   *
   *   M: month 1~12    Y: full year, e.g. 2017    D: day of month 1~31
   *   h: hours 0~23    m: minutes 0~59            s: seconds 0~59
   *
   * @param {string} formatStr - template, e.g. 'Y年M月D日'.
   * @param {number} [timestamp] - milliseconds since the epoch; defaults to now.
   * @returns {string} the formatted string.
   */
  static moment(formatStr, timestamp) {
    // 0 is a valid timestamp (the epoch); only a missing or otherwise falsy value means "now".
    const useNow = timestamp !== 0 && !timestamp;
    const date = new Date(useNow ? Date.now() : timestamp);

    const M = date.getMonth() + 1; // getMonth() is 0-based
    const Y = date.getFullYear();
    const D = date.getDate();
    const h = date.getHours();
    const m = date.getMinutes();
    const s = date.getSeconds();

    return formatStr
      .replace('M', M)
      .replace('Y', Y)
      .replace('D', D)
      .replace('h', h)
      .replace('m', m)
      .replace('s', s);
  }
}
47 |
48 | module.exports = Tools;
--------------------------------------------------------------------------------