├── .gitignore ├── README.md ├── package-lock.json ├── package.json └── src ├── config.js ├── data-source ├── baidu.js ├── bing.js ├── so.js ├── sogou.js ├── template │ └── search-engine.js └── tianyancha.js ├── duanzi ├── budejie-detail.js ├── common.js ├── gaoxiaogif-detail.js └── qiubai-detail.js ├── img └── 5b3f553ed9fb7.gif ├── index.js ├── req.js ├── tmp └── pengfu.js └── tools ├── file-upload.js ├── img-sina.js └── tools.js /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | src/data 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ### 为什么要做信息聚合工具?意义是什么? 3 | - 在日常工作生活中,多数人遇到新知识、新事物,第一反应便是去百度、知乎、bing、各个社区论坛,搜集信息,然后消化学习~~~ 4 | - 那么作为Coder的我们,为什么要手动去多个平台搜集信息,能不能有什么工具帮我们去各个平台搜集信息,最好还是消化过的,然后把有用的信息给我们呢? 5 | - OK,今天先讨论如何从各个平台搜集信息 6 | - 嗯,如标题所示,这里采用的的技术方案是Puppeteer。至于为什么要用Puppeteer?原因有二:1. 使用Puppeteer模拟用户操作,而不是简单的接口调用去获取数据,极大降低被判定为爬虫的风险。2. Puppeteer可以抓取客户端渲染的站点内容,这是传统爬虫难以做到的! 7 | 8 | ### 其次对Puppeteer进行简介 9 | - Puppeteer本质是一个node库,他提供了一组通过DevTools Protocol来操纵Headless Chrome的高级API 10 | - 其他详细介绍:[请猛戳Puppeteer官方文档](https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md#puppeteerlaunchoptions) 11 | 12 | #### Puppeteer 核心功能 13 | - 生成网页PDF、截图 14 | - 从网站抓取内容 15 | - 爬取SPA应用,并生成预渲染内容 16 | - UI自动化操作:表单提交、UI测试、键盘输入等 17 | - 创建一个最新的自动化测试环境(chrome),可以直接在此运行测试用例 18 | - 捕获站点时间线,帮助分析网站性能问题 19 | 20 | ### 接下来开始实战,Repo在 [Github Eva](https://github.com/zhentaoo/eva) 21 | 1. 运行Puppeteer 22 | ```js 23 | puppeteer.launch().then(async browser => { 24 | ...... 25 | // do what you want 26 | ...... 27 | }) 28 | ``` 29 | 30 | 2. 开一个新的tab页,跳转至事先定义好的站点 31 | ```js 32 | let page = await browser.newPage(); 33 | await page.goto('https://www.tianyancha.com/search?key=${key}'); 34 | ``` 35 | 36 | 3. 分析站点dom结构,抓取你想要的内容 37 | ```js 38 | let data = await page.evaluate(() => { 39 | let list = [...document.querySelectorAll('.search_result_single')] 40 | 41 | return list.map(el => { 42 | return { html: el.innerHTML, content: el.innerText } 43 | }) 44 | }) 45 | ``` 46 | 47 | 4. 将内容写入文件(如果大家感兴趣可以做成数据库存取,这不是重点) 48 | ```js 49 | fs.appendFileSync(`./src/data/tianyancha-${key}.txt`, `startTime: ${new Date().toUTCString()}`+'\r'); 50 | 51 | fs.appendFileSync(`./src/data/tianyancha-${key}.txt`, JSON.stringify(content, null , ' ')+'\r'); 52 | ``` 53 | 54 | 5. 5.翻页操作,获取下一页的内容 55 | ```js 56 | for (let index = 0; index < needPageMaxNum; index++) { 57 | var nextPage = await page.$('#web-content > div > div > div > div.col-9.search-2017-2.pr15.pl0 > div.b-c-white.clearfix.position-rel.mb30 > div > div.search_pager.human_pager.in-block > ul > li.pagination-next.ng-scope > a') 58 | 59 | await nextPage.click() 60 | await timeout(6 * 1000 * Math.random()); 61 | await getDataFromDom() 62 | } 63 | ``` 64 | 65 | 6. 数据获取完毕,关闭页面 66 | ```js 67 | await page.close() 68 | ``` 69 | 70 | 7. 小细节,为了更加真实,在模拟用户操作的的过程中,使用随机时间作为间隔 71 | ```js 72 | await timeout(6 * 1000 * Math.random()); 73 | ``` 74 | 75 | 8. 最后说明,这里只做了最基础的数据爬取,后续可能会做出duckduckgo么?敬请期待 76 | 77 | 9. 完整代码在: https://github.com/zhentaoo/eva (Demo型项目,无敏感信息) 78 | 79 | 10. 项目运行 80 | - git clone https://github.com/zhentaoo/eva 81 | - npm install (puppeteer在win下100+M、mac下70+M,请耐心等候,如果安装不了,请使用cnpm) 82 | - npm start 83 | -------------------------------------------------------------------------------- /package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "eva", 3 | "version": "1.0.0", 4 | "lockfileVersion": 1, 5 | "requires": true, 6 | "dependencies": { 7 | "ajv": { 8 | "version": "5.5.2", 9 | "resolved": "http://registry.npm.taobao.org/ajv/download/ajv-5.5.2.tgz", 10 | "integrity": "sha1-c7Xuyj+rZT49P5Qis0GtQiBdyWU=", 11 | "requires": { 12 | "co": "4.6.0", 13 | "fast-deep-equal": "1.1.0", 14 | "fast-json-stable-stringify": "2.0.0", 15 | "json-schema-traverse": "0.3.1" 16 | } 17 | }, 18 | "asn1": { 19 | "version": "0.2.3", 20 | "resolved": "http://registry.npm.taobao.org/asn1/download/asn1-0.2.3.tgz", 21 | "integrity": "sha1-2sh4dxPJlmhJ/IGAd36+nB3fO4Y=" 22 | }, 23 | "assert-plus": { 24 | "version": "1.0.0", 25 | "resolved": "http://registry.npm.taobao.org/assert-plus/download/assert-plus-1.0.0.tgz", 26 | "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU=" 27 | }, 28 | "async-limiter": { 29 | "version": "1.0.0", 30 | "resolved": "http://registry.npm.taobao.org/async-limiter/download/async-limiter-1.0.0.tgz", 31 | "integrity": "sha1-ePrtjD0HSrgfIrTphdeehzj3IPg=" 32 | }, 33 | "asynckit": { 34 | "version": "0.4.0", 35 | "resolved": "http://registry.npm.taobao.org/asynckit/download/asynckit-0.4.0.tgz", 36 | "integrity": "sha1-x57Zf380y48robyXkLzDZkdLS3k=" 37 | }, 38 | "aws-sign2": { 39 | "version": "0.7.0", 40 | "resolved": "http://registry.npm.taobao.org/aws-sign2/download/aws-sign2-0.7.0.tgz", 41 | "integrity": "sha1-tG6JCTSpWR8tL2+G1+ap8bP+dqg=" 42 | }, 43 | "aws4": { 44 | "version": "1.7.0", 45 | "resolved": "http://registry.npm.taobao.org/aws4/download/aws4-1.7.0.tgz", 46 | "integrity": "sha1-1NDpudv8p3vwjusKikcVUP454ok=" 47 | }, 48 | "balanced-match": { 49 | "version": "1.0.0", 50 | "resolved": "http://registry.npm.taobao.org/balanced-match/download/balanced-match-1.0.0.tgz", 51 | "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c=" 52 | }, 53 | "bcrypt-pbkdf": { 54 | "version": "1.0.1", 55 | "resolved": "http://registry.npm.taobao.org/bcrypt-pbkdf/download/bcrypt-pbkdf-1.0.1.tgz", 56 | "integrity": "sha1-Y7xdy2EzG5K8Bf1SiVPDNGKgb40=", 57 | "optional": true, 58 | "requires": { 59 | "tweetnacl": "0.14.5" 60 | } 61 | }, 62 | "bluebird": { 63 | "version": "3.5.1", 64 | "resolved": "http://registry.npm.taobao.org/bluebird/download/bluebird-3.5.1.tgz", 65 | "integrity": "sha1-2VUfnemPH82h5oPRfukaBgLuLrk=" 66 | }, 67 | "brace-expansion": { 68 | "version": "1.1.11", 69 | "resolved": "http://registry.npm.taobao.org/brace-expansion/download/brace-expansion-1.1.11.tgz", 70 | "integrity": "sha1-PH/L9SnYcibz0vUrlm/1Jx60Qd0=", 71 | "requires": { 72 | "balanced-match": "1.0.0", 73 | "concat-map": "0.0.1" 74 | } 75 | }, 76 | "caseless": { 77 | "version": "0.12.0", 78 | "resolved": "http://registry.npm.taobao.org/caseless/download/caseless-0.12.0.tgz", 79 | "integrity": "sha1-G2gcIf+EAzyCZUMJBolCDRhxUdw=" 80 | }, 81 | "co": { 82 | "version": "4.6.0", 83 | "resolved": "http://registry.npm.taobao.org/co/download/co-4.6.0.tgz", 84 | "integrity": "sha1-bqa989hTrlTMuOR7+gvz+QMfsYQ=" 85 | }, 86 | "combined-stream": { 87 | "version": "1.0.6", 88 | "resolved": "http://registry.npm.taobao.org/combined-stream/download/combined-stream-1.0.6.tgz", 89 | "integrity": "sha1-cj599ugBrFYTETp+RFqbactjKBg=", 90 | "requires": { 91 | "delayed-stream": "1.0.0" 92 | } 93 | }, 94 | "concat-map": { 95 | "version": "0.0.1", 96 | "resolved": "http://registry.npm.taobao.org/concat-map/download/concat-map-0.0.1.tgz", 97 | "integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=" 98 | }, 99 | "concat-stream": { 100 | "version": "1.6.0", 101 | "resolved": "http://registry.npm.taobao.org/concat-stream/download/concat-stream-1.6.0.tgz", 102 | "integrity": "sha1-CqxmL9Ur54lk1VMvaUeE5wEQrPc=", 103 | "requires": { 104 | "inherits": "2.0.3", 105 | "readable-stream": "2.3.6", 106 | "typedarray": "0.0.6" 107 | } 108 | }, 109 | "core-util-is": { 110 | "version": "1.0.2", 111 | "resolved": "http://registry.npm.taobao.org/core-util-is/download/core-util-is-1.0.2.tgz", 112 | "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=" 113 | }, 114 | "dashdash": { 115 | "version": "1.14.1", 116 | "resolved": "http://registry.npm.taobao.org/dashdash/download/dashdash-1.14.1.tgz", 117 | "integrity": "sha1-hTz6D3y+L+1d4gMmuN1YEDX24vA=", 118 | "requires": { 119 | "assert-plus": "1.0.0" 120 | } 121 | }, 122 | "debug": { 123 | "version": "2.6.9", 124 | "resolved": "http://registry.npm.taobao.org/debug/download/debug-2.6.9.tgz", 125 | "integrity": "sha1-XRKFFd8TT/Mn6QpMk/Tgd6U2NB8=", 126 | "requires": { 127 | "ms": "2.0.0" 128 | } 129 | }, 130 | "delayed-stream": { 131 | "version": "1.0.0", 132 | "resolved": "http://registry.npm.taobao.org/delayed-stream/download/delayed-stream-1.0.0.tgz", 133 | "integrity": "sha1-3zrhmayt+31ECqrgsp4icrJOxhk=" 134 | }, 135 | "ecc-jsbn": { 136 | "version": "0.1.1", 137 | "resolved": "http://registry.npm.taobao.org/ecc-jsbn/download/ecc-jsbn-0.1.1.tgz", 138 | "integrity": "sha1-D8c6ntXw1Tw4GTOYUj735UN3dQU=", 139 | "optional": true, 140 | "requires": { 141 | "jsbn": "0.1.1" 142 | } 143 | }, 144 | "extend": { 145 | "version": "3.0.1", 146 | "resolved": "http://registry.npm.taobao.org/extend/download/extend-3.0.1.tgz", 147 | "integrity": "sha1-p1Xqe8Gt/MWjHOfnYtuq3F5jZEQ=" 148 | }, 149 | "extract-zip": { 150 | "version": "1.6.6", 151 | "resolved": "http://registry.npm.taobao.org/extract-zip/download/extract-zip-1.6.6.tgz", 152 | "integrity": "sha1-EpDt6NINCHK0Kf0/NRyhKOxe+Fw=", 153 | "requires": { 154 | "concat-stream": "1.6.0", 155 | "debug": "2.6.9", 156 | "mkdirp": "0.5.0", 157 | "yauzl": "2.4.1" 158 | } 159 | }, 160 | "extsprintf": { 161 | "version": "1.3.0", 162 | "resolved": "http://registry.npm.taobao.org/extsprintf/download/extsprintf-1.3.0.tgz", 163 | "integrity": "sha1-lpGEQOMEGnpBT4xS48V06zw+HgU=" 164 | }, 165 | "fast-deep-equal": { 166 | "version": "1.1.0", 167 | "resolved": "http://registry.npm.taobao.org/fast-deep-equal/download/fast-deep-equal-1.1.0.tgz", 168 | "integrity": "sha1-wFNHeBfIa1HaqFPIHgWbcz0CNhQ=" 169 | }, 170 | "fast-json-stable-stringify": { 171 | "version": "2.0.0", 172 | "resolved": "http://registry.npm.taobao.org/fast-json-stable-stringify/download/fast-json-stable-stringify-2.0.0.tgz", 173 | "integrity": "sha1-1RQsDK7msRifh9OnYREGT4bIu/I=" 174 | }, 175 | "fd-slicer": { 176 | "version": "1.0.1", 177 | "resolved": "http://registry.npm.taobao.org/fd-slicer/download/fd-slicer-1.0.1.tgz", 178 | "integrity": "sha1-i1vL2ewyfFBBv5qwI/1nUPEXfmU=", 179 | "requires": { 180 | "pend": "1.2.0" 181 | } 182 | }, 183 | "forever-agent": { 184 | "version": "0.6.1", 185 | "resolved": "http://registry.npm.taobao.org/forever-agent/download/forever-agent-0.6.1.tgz", 186 | "integrity": "sha1-+8cfDEGt6zf5bFd60e1C2P2sypE=" 187 | }, 188 | "form-data": { 189 | "version": "2.3.2", 190 | "resolved": "http://registry.npm.taobao.org/form-data/download/form-data-2.3.2.tgz", 191 | "integrity": "sha1-SXBJi+YEwgwAXU9cI67NIda0kJk=", 192 | "requires": { 193 | "asynckit": "0.4.0", 194 | "combined-stream": "1.0.6", 195 | "mime-types": "2.1.18" 196 | } 197 | }, 198 | "fs.realpath": { 199 | "version": "1.0.0", 200 | "resolved": "http://registry.npm.taobao.org/fs.realpath/download/fs.realpath-1.0.0.tgz", 201 | "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=" 202 | }, 203 | "getpass": { 204 | "version": "0.1.7", 205 | "resolved": "http://registry.npm.taobao.org/getpass/download/getpass-0.1.7.tgz", 206 | "integrity": "sha1-Xv+OPmhNVprkyysSgmBOi6YhSfo=", 207 | "requires": { 208 | "assert-plus": "1.0.0" 209 | } 210 | }, 211 | "glob": { 212 | "version": "7.1.2", 213 | "resolved": "http://registry.npm.taobao.org/glob/download/glob-7.1.2.tgz", 214 | "integrity": "sha1-wZyd+aAocC1nhhI4SmVSQExjbRU=", 215 | "requires": { 216 | "fs.realpath": "1.0.0", 217 | "inflight": "1.0.6", 218 | "inherits": "2.0.3", 219 | "minimatch": "3.0.4", 220 | "once": "1.4.0", 221 | "path-is-absolute": "1.0.1" 222 | } 223 | }, 224 | "har-schema": { 225 | "version": "2.0.0", 226 | "resolved": "http://registry.npm.taobao.org/har-schema/download/har-schema-2.0.0.tgz", 227 | "integrity": "sha1-qUwiJOvKwEeCoNkDVSHyRzW37JI=" 228 | }, 229 | "har-validator": { 230 | "version": "5.0.3", 231 | "resolved": "http://registry.npm.taobao.org/har-validator/download/har-validator-5.0.3.tgz", 232 | "integrity": "sha1-ukAsJmGU8VlW7xXg/PJCmT9qff0=", 233 | "requires": { 234 | "ajv": "5.5.2", 235 | "har-schema": "2.0.0" 236 | } 237 | }, 238 | "http-signature": { 239 | "version": "1.2.0", 240 | "resolved": "http://registry.npm.taobao.org/http-signature/download/http-signature-1.2.0.tgz", 241 | "integrity": "sha1-muzZJRFHcvPZW2WmCruPfBj7rOE=", 242 | "requires": { 243 | "assert-plus": "1.0.0", 244 | "jsprim": "1.4.1", 245 | "sshpk": "1.14.1" 246 | } 247 | }, 248 | "inflight": { 249 | "version": "1.0.6", 250 | "resolved": "http://registry.npm.taobao.org/inflight/download/inflight-1.0.6.tgz", 251 | "integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=", 252 | "requires": { 253 | "once": "1.4.0", 254 | "wrappy": "1.0.2" 255 | } 256 | }, 257 | "inherits": { 258 | "version": "2.0.3", 259 | "resolved": "http://registry.npm.taobao.org/inherits/download/inherits-2.0.3.tgz", 260 | "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4=" 261 | }, 262 | "is-typedarray": { 263 | "version": "1.0.0", 264 | "resolved": "http://registry.npm.taobao.org/is-typedarray/download/is-typedarray-1.0.0.tgz", 265 | "integrity": "sha1-5HnICFjfDBsR3dppQPlgEfzaSpo=" 266 | }, 267 | "isarray": { 268 | "version": "1.0.0", 269 | "resolved": "http://registry.npm.taobao.org/isarray/download/isarray-1.0.0.tgz", 270 | "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=" 271 | }, 272 | "isstream": { 273 | "version": "0.1.2", 274 | "resolved": "http://registry.npm.taobao.org/isstream/download/isstream-0.1.2.tgz", 275 | "integrity": "sha1-R+Y/evVa+m+S4VAOaQ64uFKcCZo=" 276 | }, 277 | "jsbn": { 278 | "version": "0.1.1", 279 | "resolved": "http://registry.npm.taobao.org/jsbn/download/jsbn-0.1.1.tgz", 280 | "integrity": "sha1-peZUwuWi3rXyAdls77yoDA7y9RM=", 281 | "optional": true 282 | }, 283 | "json-schema": { 284 | "version": "0.2.3", 285 | "resolved": "http://registry.npm.taobao.org/json-schema/download/json-schema-0.2.3.tgz", 286 | "integrity": "sha1-tIDIkuWaLwWVTOcnvT8qTogvnhM=" 287 | }, 288 | "json-schema-traverse": { 289 | "version": "0.3.1", 290 | "resolved": "http://registry.npm.taobao.org/json-schema-traverse/download/json-schema-traverse-0.3.1.tgz", 291 | "integrity": "sha1-NJptRMU6Ud6JtAgFxdXlm0F9M0A=" 292 | }, 293 | "json-stringify-safe": { 294 | "version": "5.0.1", 295 | "resolved": "http://registry.npm.taobao.org/json-stringify-safe/download/json-stringify-safe-5.0.1.tgz", 296 | "integrity": "sha1-Epai1Y/UXxmg9s4B1lcB4sc1tus=" 297 | }, 298 | "jsprim": { 299 | "version": "1.4.1", 300 | "resolved": "http://registry.npm.taobao.org/jsprim/download/jsprim-1.4.1.tgz", 301 | "integrity": "sha1-MT5mvB5cwG5Di8G3SZwuXFastqI=", 302 | "requires": { 303 | "assert-plus": "1.0.0", 304 | "extsprintf": "1.3.0", 305 | "json-schema": "0.2.3", 306 | "verror": "1.10.0" 307 | } 308 | }, 309 | "lodash": { 310 | "version": "4.17.10", 311 | "resolved": "http://registry.npm.taobao.org/lodash/download/lodash-4.17.10.tgz", 312 | "integrity": "sha1-G3eTz3JZ6jj7NmHU04syYK+K5Oc=" 313 | }, 314 | "mime": { 315 | "version": "1.6.0", 316 | "resolved": "http://registry.npm.taobao.org/mime/download/mime-1.6.0.tgz", 317 | "integrity": "sha1-Ms2eXGRVO9WNGaVor0Uqz/BJgbE=" 318 | }, 319 | "mime-db": { 320 | "version": "1.33.0", 321 | "resolved": "http://registry.npm.taobao.org/mime-db/download/mime-db-1.33.0.tgz", 322 | "integrity": "sha1-o0kgUKXLm2NFBUHjnZeI0icng9s=" 323 | }, 324 | "mime-types": { 325 | "version": "2.1.18", 326 | "resolved": "http://registry.npm.taobao.org/mime-types/download/mime-types-2.1.18.tgz", 327 | "integrity": "sha1-bzI/YKg9ERRvgx/xH9ZuL+VQO7g=", 328 | "requires": { 329 | "mime-db": "1.33.0" 330 | } 331 | }, 332 | "minimatch": { 333 | "version": "3.0.4", 334 | "resolved": "http://registry.npm.taobao.org/minimatch/download/minimatch-3.0.4.tgz", 335 | "integrity": "sha1-UWbihkV/AzBgZL5Ul+jbsMPTIIM=", 336 | "requires": { 337 | "brace-expansion": "1.1.11" 338 | } 339 | }, 340 | "minimist": { 341 | "version": "0.0.8", 342 | "resolved": "http://registry.npm.taobao.org/minimist/download/minimist-0.0.8.tgz", 343 | "integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0=" 344 | }, 345 | "mkdirp": { 346 | "version": "0.5.0", 347 | "resolved": "http://registry.npm.taobao.org/mkdirp/download/mkdirp-0.5.0.tgz", 348 | "integrity": "sha1-HXMHam35hs2TROFecfzAWkyavxI=", 349 | "requires": { 350 | "minimist": "0.0.8" 351 | } 352 | }, 353 | "moment": { 354 | "version": "2.22.2", 355 | "resolved": "http://registry.npm.taobao.org/moment/download/moment-2.22.2.tgz", 356 | "integrity": "sha1-PCV/mDn8DpP/UxSWMiOeuQeD/2Y=" 357 | }, 358 | "ms": { 359 | "version": "2.0.0", 360 | "resolved": "http://registry.npm.taobao.org/ms/download/ms-2.0.0.tgz", 361 | "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=" 362 | }, 363 | "multiparty": { 364 | "version": "4.1.4", 365 | "resolved": "http://registry.npm.taobao.org/multiparty/download/multiparty-4.1.4.tgz", 366 | "integrity": "sha1-TJbcvcEeP3kX4WFeZAtLUCK+ZP0=", 367 | "requires": { 368 | "fd-slicer": "1.0.1", 369 | "safe-buffer": "5.1.2" 370 | } 371 | }, 372 | "oauth-sign": { 373 | "version": "0.8.2", 374 | "resolved": "http://registry.npm.taobao.org/oauth-sign/download/oauth-sign-0.8.2.tgz", 375 | "integrity": "sha1-Rqarfwrq2N6unsBWV4C31O/rnUM=" 376 | }, 377 | "once": { 378 | "version": "1.4.0", 379 | "resolved": "http://registry.npm.taobao.org/once/download/once-1.4.0.tgz", 380 | "integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=", 381 | "requires": { 382 | "wrappy": "1.0.2" 383 | } 384 | }, 385 | "path-is-absolute": { 386 | "version": "1.0.1", 387 | "resolved": "http://registry.npm.taobao.org/path-is-absolute/download/path-is-absolute-1.0.1.tgz", 388 | "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=" 389 | }, 390 | "pend": { 391 | "version": "1.2.0", 392 | "resolved": "http://registry.npm.taobao.org/pend/download/pend-1.2.0.tgz", 393 | "integrity": "sha1-elfrVQpng/kRUzH89GY9XI4AelA=" 394 | }, 395 | "performance-now": { 396 | "version": "2.1.0", 397 | "resolved": "http://registry.npm.taobao.org/performance-now/download/performance-now-2.1.0.tgz", 398 | "integrity": "sha1-Ywn04OX6kT7BxpMHrjZLSzd8nns=" 399 | }, 400 | "process-nextick-args": { 401 | "version": "2.0.0", 402 | "resolved": "http://registry.npm.taobao.org/process-nextick-args/download/process-nextick-args-2.0.0.tgz", 403 | "integrity": "sha1-o31zL0JxtKsa0HDTVQjoKQeI/6o=" 404 | }, 405 | "progress": { 406 | "version": "2.0.0", 407 | "resolved": "http://registry.npm.taobao.org/progress/download/progress-2.0.0.tgz", 408 | "integrity": "sha1-ihvjZr+Pwj2yvSPxDG/pILQ4nR8=" 409 | }, 410 | "punycode": { 411 | "version": "1.4.1", 412 | "resolved": "http://registry.npm.taobao.org/punycode/download/punycode-1.4.1.tgz", 413 | "integrity": "sha1-wNWmOycYgArY4esPpSachN1BhF4=" 414 | }, 415 | "puppeteer": { 416 | "version": "0.9.0", 417 | "resolved": "http://registry.npm.taobao.org/puppeteer/download/puppeteer-0.9.0.tgz", 418 | "integrity": "sha1-1lmX/4PiTrVp5Vd9L3VpXcvlvko=", 419 | "requires": { 420 | "debug": "2.6.9", 421 | "extract-zip": "1.6.6", 422 | "mime": "1.6.0", 423 | "progress": "2.0.0", 424 | "rimraf": "2.6.2", 425 | "ws": "3.3.3" 426 | } 427 | }, 428 | "qs": { 429 | "version": "6.5.2", 430 | "resolved": "http://registry.npm.taobao.org/qs/download/qs-6.5.2.tgz", 431 | "integrity": "sha1-yzroBuh0BERYTvFUzo7pjUA/PjY=" 432 | }, 433 | "readable-stream": { 434 | "version": "2.3.6", 435 | "resolved": "http://registry.npm.taobao.org/readable-stream/download/readable-stream-2.3.6.tgz", 436 | "integrity": "sha1-sRwn2IuP8fvgcGQ8+UsMea4bCq8=", 437 | "requires": { 438 | "core-util-is": "1.0.2", 439 | "inherits": "2.0.3", 440 | "isarray": "1.0.0", 441 | "process-nextick-args": "2.0.0", 442 | "safe-buffer": "5.1.2", 443 | "string_decoder": "1.1.1", 444 | "util-deprecate": "1.0.2" 445 | } 446 | }, 447 | "request": { 448 | "version": "2.87.0", 449 | "resolved": "http://registry.npm.taobao.org/request/download/request-2.87.0.tgz", 450 | "integrity": "sha1-MvACNc0I1IK00NaNuTqCnA7VdW4=", 451 | "requires": { 452 | "aws-sign2": "0.7.0", 453 | "aws4": "1.7.0", 454 | "caseless": "0.12.0", 455 | "combined-stream": "1.0.6", 456 | "extend": "3.0.1", 457 | "forever-agent": "0.6.1", 458 | "form-data": "2.3.2", 459 | "har-validator": "5.0.3", 460 | "http-signature": "1.2.0", 461 | "is-typedarray": "1.0.0", 462 | "isstream": "0.1.2", 463 | "json-stringify-safe": "5.0.1", 464 | "mime-types": "2.1.18", 465 | "oauth-sign": "0.8.2", 466 | "performance-now": "2.1.0", 467 | "qs": "6.5.2", 468 | "safe-buffer": "5.1.2", 469 | "tough-cookie": "2.3.4", 470 | "tunnel-agent": "0.6.0", 471 | "uuid": "3.2.1" 472 | } 473 | }, 474 | "request-promise": { 475 | "version": "4.2.2", 476 | "resolved": "http://registry.npm.taobao.org/request-promise/download/request-promise-4.2.2.tgz", 477 | "integrity": "sha1-0epG1lSm7k+O5qT+oQGMIpEZBLQ=", 478 | "requires": { 479 | "bluebird": "3.5.1", 480 | "request-promise-core": "1.1.1", 481 | "stealthy-require": "1.1.1", 482 | "tough-cookie": "2.3.4" 483 | } 484 | }, 485 | "request-promise-core": { 486 | "version": "1.1.1", 487 | "resolved": "http://registry.npm.taobao.org/request-promise-core/download/request-promise-core-1.1.1.tgz", 488 | "integrity": "sha1-Pu4AssWqgyOc+wTFcA2jb4HNCLY=", 489 | "requires": { 490 | "lodash": "4.17.10" 491 | } 492 | }, 493 | "rimraf": { 494 | "version": "2.6.2", 495 | "resolved": "http://registry.npm.taobao.org/rimraf/download/rimraf-2.6.2.tgz", 496 | "integrity": "sha1-LtgVDSShbqhlHm1u8PR8QVjOejY=", 497 | "requires": { 498 | "glob": "7.1.2" 499 | } 500 | }, 501 | "safe-buffer": { 502 | "version": "5.1.2", 503 | "resolved": "http://registry.npm.taobao.org/safe-buffer/download/safe-buffer-5.1.2.tgz", 504 | "integrity": "sha1-mR7GnSluAxN0fVm9/St0XDX4go0=" 505 | }, 506 | "sshpk": { 507 | "version": "1.14.1", 508 | "resolved": "http://registry.npm.taobao.org/sshpk/download/sshpk-1.14.1.tgz", 509 | "integrity": "sha1-Ew9Zde3a2WPx1W+SuaxsUfqfg+s=", 510 | "requires": { 511 | "asn1": "0.2.3", 512 | "assert-plus": "1.0.0", 513 | "bcrypt-pbkdf": "1.0.1", 514 | "dashdash": "1.14.1", 515 | "ecc-jsbn": "0.1.1", 516 | "getpass": "0.1.7", 517 | "jsbn": "0.1.1", 518 | "tweetnacl": "0.14.5" 519 | } 520 | }, 521 | "stealthy-require": { 522 | "version": "1.1.1", 523 | "resolved": "http://registry.npm.taobao.org/stealthy-require/download/stealthy-require-1.1.1.tgz", 524 | "integrity": "sha1-NbCYdbT/SfJqd35QmzCQoyJr8ks=" 525 | }, 526 | "string_decoder": { 527 | "version": "1.1.1", 528 | "resolved": "http://registry.npm.taobao.org/string_decoder/download/string_decoder-1.1.1.tgz", 529 | "integrity": "sha1-nPFhG6YmhdcDCunkujQUnDrwP8g=", 530 | "requires": { 531 | "safe-buffer": "5.1.2" 532 | } 533 | }, 534 | "tough-cookie": { 535 | "version": "2.3.4", 536 | "resolved": "http://registry.npm.taobao.org/tough-cookie/download/tough-cookie-2.3.4.tgz", 537 | "integrity": "sha1-7GDO44rGdQY//JelwYlwV47oNlU=", 538 | "requires": { 539 | "punycode": "1.4.1" 540 | } 541 | }, 542 | "tunnel-agent": { 543 | "version": "0.6.0", 544 | "resolved": "http://registry.npm.taobao.org/tunnel-agent/download/tunnel-agent-0.6.0.tgz", 545 | "integrity": "sha1-J6XeoGs2sEoKmWZ3SykIaPD8QP0=", 546 | "requires": { 547 | "safe-buffer": "5.1.2" 548 | } 549 | }, 550 | "tweetnacl": { 551 | "version": "0.14.5", 552 | "resolved": "http://registry.npm.taobao.org/tweetnacl/download/tweetnacl-0.14.5.tgz", 553 | "integrity": "sha1-WuaBd/GS1EViadEIr6k/+HQ/T2Q=", 554 | "optional": true 555 | }, 556 | "typedarray": { 557 | "version": "0.0.6", 558 | "resolved": "http://registry.npm.taobao.org/typedarray/download/typedarray-0.0.6.tgz", 559 | "integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c=" 560 | }, 561 | "ultron": { 562 | "version": "1.1.1", 563 | "resolved": "http://registry.npm.taobao.org/ultron/download/ultron-1.1.1.tgz", 564 | "integrity": "sha1-n+FTahCmZKZSZqHjzPhf02MCvJw=" 565 | }, 566 | "util-deprecate": { 567 | "version": "1.0.2", 568 | "resolved": "http://registry.npm.taobao.org/util-deprecate/download/util-deprecate-1.0.2.tgz", 569 | "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=" 570 | }, 571 | "uuid": { 572 | "version": "3.2.1", 573 | "resolved": "http://registry.npm.taobao.org/uuid/download/uuid-3.2.1.tgz", 574 | "integrity": "sha1-EsUou51Y0LkmXZovbw/ovhf/HxQ=" 575 | }, 576 | "verror": { 577 | "version": "1.10.0", 578 | "resolved": "http://registry.npm.taobao.org/verror/download/verror-1.10.0.tgz", 579 | "integrity": "sha1-OhBcoXBTr1XW4nDB+CiGguGNpAA=", 580 | "requires": { 581 | "assert-plus": "1.0.0", 582 | "core-util-is": "1.0.2", 583 | "extsprintf": "1.3.0" 584 | } 585 | }, 586 | "wrappy": { 587 | "version": "1.0.2", 588 | "resolved": "http://registry.npm.taobao.org/wrappy/download/wrappy-1.0.2.tgz", 589 | "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=" 590 | }, 591 | "ws": { 592 | "version": "3.3.3", 593 | "resolved": "http://registry.npm.taobao.org/ws/download/ws-3.3.3.tgz", 594 | "integrity": "sha1-8c+E/i1ekB686U767OeF8YeiKPI=", 595 | "requires": { 596 | "async-limiter": "1.0.0", 597 | "safe-buffer": "5.1.2", 598 | "ultron": "1.1.1" 599 | } 600 | }, 601 | "yauzl": { 602 | "version": "2.4.1", 603 | "resolved": "http://registry.npm.taobao.org/yauzl/download/yauzl-2.4.1.tgz", 604 | "integrity": "sha1-lSj0QtqxsihOWLQ3m7GU4i4MQAU=", 605 | "requires": { 606 | "fd-slicer": "1.0.1" 607 | } 608 | } 609 | } 610 | } 611 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "eva", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "node ./src/tools/tools.js", 8 | "start": "node ./src/index.js", 9 | "req": "node ./src/req.js", 10 | "file": "node ./src/tools/file-upload.js", 11 | "img": "node ./src/tools/img-sina.js" 12 | }, 13 | "author": "", 14 | "license": "ISC", 15 | "dependencies": { 16 | "moment": "^2.22.2", 17 | "multiparty": "^4.1.4", 18 | "puppeteer": "0.9.0", 19 | "request": "^2.87.0", 20 | "request-promise": "^4.2.2" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | key: 'js中文分词', 3 | company: '互联网' 4 | } -------------------------------------------------------------------------------- /src/data-source/baidu.js: -------------------------------------------------------------------------------- 1 | var fs = require("fs"); 2 | 3 | // 搜索源 4 | const source = 'baidu' 5 | 6 | // 路径 7 | const url = 'https://www.baidu.com' 8 | 9 | // 搜索按钮,选择器 10 | const submitSelectName = '#su' 11 | 12 | // 页面内容,选择器 13 | const domSelectName = '.c-container' 14 | 15 | // 下一页,选择器 16 | const nextPageSelectName = '#page > a:last-child' 17 | 18 | // 翻页数量 19 | const needPageMaxNum = 20 20 | 21 | module.exports = async(browser, timeout, key) => { 22 | require('./template/search-engine.js')( 23 | browser, timeout, key, 24 | source, url, submitSelectName, domSelectName, 25 | nextPageSelectName, needPageMaxNum 26 | ) 27 | } 28 | -------------------------------------------------------------------------------- /src/data-source/bing.js: -------------------------------------------------------------------------------- 1 | var fs = require("fs"); 2 | 3 | const source = 'bing' 4 | const url = 'https://cn.bing.com/' 5 | const submitSelectName = '#sb_form_go' 6 | const domSelectName = '.b_algo' 7 | const nextPageSelectName = '[title="下一页"]' 8 | const needPageMaxNum = 20 9 | 10 | module.exports = async(browser, timeout, key) => { 11 | require('./template/search-engine.js') ( 12 | browser, timeout, key, 13 | source, url, submitSelectName, domSelectName, 14 | nextPageSelectName, needPageMaxNum 15 | ) 16 | } -------------------------------------------------------------------------------- /src/data-source/so.js: -------------------------------------------------------------------------------- 1 | var fs = require("fs"); 2 | 3 | // 搜索源 4 | const source = 'so' 5 | 6 | // 路径 7 | const url = 'https://www.so.com' 8 | 9 | // 搜索按钮,选择器 10 | const submitSelectName = '#search-button' 11 | 12 | // 页面内容,选择器 13 | const domSelectName = '.res-list' 14 | 15 | // 下一页,选择器 16 | const nextPageSelectName = '#snext' 17 | 18 | // 翻页数量 19 | const needPageMaxNum = 20 20 | 21 | module.exports = async(browser, timeout, key) => { 22 | require('./template/search-engine.js')( 23 | browser, timeout, key, 24 | source, url, submitSelectName, domSelectName, 25 | nextPageSelectName, needPageMaxNum 26 | ) 27 | } 28 | -------------------------------------------------------------------------------- /src/data-source/sogou.js: -------------------------------------------------------------------------------- 1 | var fs = require("fs"); 2 | 3 | // 搜索源 4 | const source = 'sogou' 5 | 6 | // 路径 7 | const url = 'https://www.sogou.com/' 8 | 9 | // 搜索按钮,选择器 10 | const submitSelectName = '#stb' 11 | 12 | // 页面内容,选择器 13 | const domSelectName = '.vrwrap' 14 | 15 | // 下一页,选择器 16 | const nextPageSelectName = '#sogou_next' 17 | 18 | // 翻页数量 19 | const needPageMaxNum = 20 20 | 21 | module.exports = async(browser, timeout, key) => { 22 | require('./template/search-engine.js')( 23 | browser, timeout, key, 24 | source, url, submitSelectName, domSelectName, 25 | nextPageSelectName, needPageMaxNum 26 | ) 27 | } 28 | -------------------------------------------------------------------------------- /src/data-source/template/search-engine.js: -------------------------------------------------------------------------------- 1 | var fs = require("fs"); 2 | 3 | module.exports = async ( 4 | browser, timeout, key, 5 | source, url, submitSelectName, domSelectName, 6 | nextPageSelectName, needPageMaxNum 7 | ) => { 8 | // 从dom获取数据,并写文件 9 | var getDataFromDom = async () => { 10 | await timeout(1500); 11 | var data = await page.evaluate((domSelectName) => { 12 | var list = [...document.querySelectorAll(domSelectName)] 13 | 14 | return list.map(el => { 15 | return { html: el.innerHTML, content: el.innerText } 16 | }) 17 | }, domSelectName) 18 | 19 | var content = [] 20 | data.forEach(element => { 21 | content.push(element.content) 22 | console.log(content) 23 | }); 24 | 25 | await fs.appendFileSync(`./src/data/${key}-${source}.txt`, `startTime: ${new Date().toUTCString()}`+'\r'); 26 | await fs.appendFileSync(`./src/data/${key}-${source}.txt`, JSON.stringify(content, null , ' ')+'\r'); 27 | await fs.appendFileSync(`./src/data/${key}-${source}.txt`, `endTime: ${new Date().toUTCString()}`+'\r\r'); 28 | } 29 | 30 | // 1.跳转至相应的页面 31 | var page = await browser.newPage(); 32 | await page.goto(url); 33 | 34 | // 2.输入关键字 35 | await timeout(3000* Math.random()); 36 | await page.type(key, { delay: 100 }) 37 | 38 | // 3.点击提交 39 | var submit = await page.$(submitSelectName) 40 | await submit.click() 41 | 42 | // 4.获取数据,数据写文件 43 | await getDataFromDom() 44 | 45 | // 6.翻页,数据写文件 46 | for (let index = 0; index < needPageMaxNum; index++) { 47 | var nextPage = await page.$(nextPageSelectName) 48 | await nextPage.click() 49 | await timeout(3000* Math.random()); 50 | await getDataFromDom() 51 | } 52 | 53 | // 7.关闭页面 54 | await timeout(3000* Math.random()); 55 | await page.close() 56 | } -------------------------------------------------------------------------------- /src/data-source/tianyancha.js: -------------------------------------------------------------------------------- 1 | var fs = require("fs"); 2 | 3 | module.exports = async (browser, timeout, key) => { 4 | var getDataFromDom = async () => { 5 | await timeout(1500); 6 | var data = await page.evaluate(() => { 7 | var list = [...document.querySelectorAll('.search_result_single')] 8 | 9 | return list.map(el => { 10 | return { html: el.innerHTML, content: el.innerText } 11 | }) 12 | }) 13 | var content = [] 14 | data.forEach(element => { 15 | content.push(element.content) 16 | console.log(content) 17 | }); 18 | 19 | await fs.appendFileSync(`./src/data/tianyancha-${key}.txt`, `startTime: ${new Date().toUTCString()}`+'\r'); 20 | await fs.appendFileSync(`./src/data/tianyancha-${key}.txt`, JSON.stringify(content, null , ' ')+'\r'); 21 | await fs.appendFileSync(`./src/data/tianyancha-${key}.txt`, `endTime: ${new Date().toUTCString()}`+'\r\r'); 22 | } 23 | 24 | var page = await browser.newPage(); 25 | await page.goto(`https://www.tianyancha.com/search?key=${key}`); 26 | await timeout(500); 27 | await getDataFromDom() 28 | 29 | for (let index = 0; index < 10000000; index++) { 30 | var nextPage = await page.$('#web-content > div > div > div > div.col-9.search-2017-2.pr15.pl0 > div.b-c-white.clearfix.position-rel.mb30 > div > div.search_pager.human_pager.in-block > ul > li.pagination-next.ng-scope > a') 31 | 32 | await nextPage.click() 33 | await timeout(6 * 1000 * Math.random()); 34 | await getDataFromDom() 35 | } 36 | 37 | await page.close() 38 | } -------------------------------------------------------------------------------- /src/duanzi/budejie-detail.js: -------------------------------------------------------------------------------- 1 | var origin_key = 'b61d91ea6c3' 2 | var crypto = require('crypto') 3 | var sha1 = crypto.createHash('sha1'); 4 | var moment = require('moment') 5 | var str = moment().format('YYYY-MM-DD 00:00:00') + origin_key 6 | sha1.update(str) 7 | var keyCode = sha1.digest('hex') 8 | keyCode = keyCode.substr(-15) 9 | var fs = require("fs"); 10 | var req = require('request-promise'); 11 | var common = require('./common.js') 12 | 13 | var theUrl = 'http://www.budejie.com' 14 | 15 | var fromName = '不得姐' 16 | var fromFileName = 'budejie' 17 | 18 | module.exports = async (browser, timeout, key) => { 19 | var getDataFromDom = async () => { 20 | await timeout(1500); 21 | 22 | var data = await page.evaluate(() => { 23 | let discuss = [] 24 | document.querySelectorAll('#hotCommentList').forEach(el => { 25 | if (el.querySelector('.g-mnc1') && el.querySelector('.g-mnc1').innerText) { 26 | let txt = el.querySelector('.g-mnc1').innerText 27 | discuss.push(txt.trim()) 28 | } 29 | }) 30 | 31 | return { 32 | title: null, 33 | content: document.querySelector('.j-r-list-c-desc') ? document.querySelector('.j-r-list-c-desc').innerText : null, 34 | discuss: JSON.stringify(discuss), 35 | imgurl: document.querySelector('.j-r-list-c-img > img') ? document.querySelector('.j-r-list-c-img > img').src : null, 36 | cdn_img_url: null, 37 | zan: document.querySelector('.j-r-list-tool-l-up').innerText.trim(), 38 | comments: document.querySelector('.comment-counts').innerText.trim(), 39 | type: null 40 | } 41 | }) 42 | 43 | data = [data] 44 | // 上传图片 45 | // data = await common.uploadImg(data) 46 | 47 | // 写文件 48 | await common.wirteFile(data, fromFileName) 49 | 50 | // 上传到后台 51 | await common.uploadData(fromName, data, firstUrl) 52 | } 53 | 54 | var page = await browser.newPage(); 55 | await page.goto(theUrl) 56 | await timeout(2000) 57 | 58 | var firstUrl = await page.evaluate(() => { 59 | var url = document.querySelector('.j-r-list-c-img > a').href; 60 | return url 61 | }) 62 | console.log('firstUrl:', firstUrl) 63 | 64 | await page.goto(firstUrl); 65 | await timeout(3000); 66 | await getDataFromDom(); 67 | 68 | for (let index = 0; index < 10000000; index++) { 69 | var nextPage = await page.$('.c-next-btn') 70 | 71 | await nextPage.click() 72 | await timeout(6 * 1000 * Math.random()); 73 | await getDataFromDom() 74 | } 75 | await page.close() 76 | } -------------------------------------------------------------------------------- /src/duanzi/common.js: -------------------------------------------------------------------------------- 1 | var fs = require("fs"); 2 | var req = require('request-promise'); 3 | var origin_key = 'b61d91ea6c3' 4 | var crypto = require('crypto') 5 | var sha1 = crypto.createHash('sha1'); 6 | var moment = require('moment') 7 | var str = moment().format('YYYY-MM-DD 00:00:00') + origin_key 8 | sha1.update(str) 9 | var keyCode = sha1.digest('hex') 10 | keyCode = keyCode.substr(-15) 11 | 12 | var proxyPool = [ 13 | 'http://119.27.177.169:80', 14 | // 'http://110.73.7.162:8123', 15 | // 'http://119.28.203.242:9000', 16 | // 'http://222.88.149.32:8060', 17 | // 'http://118.190.95.43:9001', 18 | // 'http://144.0.111.107:8060', 19 | // 'http://120.131.9.254:1080', 20 | // 'http://121.18.231.74:80', 21 | // 'http://39.137.69.9:80', 22 | // 'http://103.242.219.242:8080', 23 | // 'http://119.28.194.66:8888', 24 | ] 25 | 26 | module.exports = { 27 | // 上传图片文件 28 | uploadImg: async (data) => { 29 | for (let i = 0; i < data.length; i++) { 30 | 31 | let item = data[i] 32 | if (!item.imgurl) { 33 | continue 34 | } 35 | 36 | if (0) { 37 | // 小贱图床,(实际新浪) 38 | try { 39 | console.log('上传 小贱-图片') 40 | let imgurl = item.imgurl 41 | let type = imgurl.split('.').pop() 42 | let response = await req({ 43 | url: `https://pic.xiaojianjian.net/webtools/picbed/uploadByUrl.htm?url=${imgurl}` 44 | }) 45 | console.log('response:', response) 46 | 47 | response = JSON.parse(response) 48 | let cdn_img_url = response.original_pic 49 | item.cdn_img_url = cdn_img_url 50 | } catch (error) { } 51 | } else { 52 | // yum6图床(实际新浪) 53 | try { 54 | console.log('上传 yum6-图片') 55 | let imgurl = item.imgurl 56 | let type = imgurl.split('.').pop() 57 | let rdProxy = Math.floor(Math.random() * proxyPool.length) 58 | console.log(proxyPool[rdProxy]) 59 | 60 | let response = await req({ 61 | url: 'https://api.yum6.cn/sinaimg.php?img=' + imgurl, 62 | // proxy: proxyPool[rdProxy] 63 | }) 64 | console.log('response:', response) 65 | 66 | response = JSON.parse(response) 67 | let cdn_img_url = 'https://ww2.sinaimg.cn/large/' + response.pid + '.' + type 68 | item.cdn_img_url = cdn_img_url 69 | } catch (error) { 70 | console.log(i, error) 71 | } 72 | } 73 | 74 | } 75 | return data 76 | }, 77 | // 写文件 78 | wirteFile: async (data, fromFileName) => { 79 | try { 80 | console.log('写文件') 81 | await fs.appendFileSync(`./src/data/${fromFileName}.txt`, JSON.stringify(data[0], null, ' ') + '\r'); 82 | } catch (error) { 83 | console.log('err:', error) 84 | } 85 | }, 86 | // 上传到后台 87 | uploadData: async (fromName, data, url) => { 88 | var options = { 89 | method: 'POST', 90 | timeout: 3000000, 91 | uri: 'https://juhe.qqeasy.com/information/import-jokes', 92 | body: { 93 | "key": keyCode, 94 | "from": fromName, 95 | "from_url": url, 96 | "create_time": new Date().toUTCString(), 97 | "data": { 98 | "contents": data 99 | } 100 | }, 101 | json: true 102 | } 103 | 104 | try { 105 | console.log('上传到后台') 106 | let response = await req(options) 107 | } catch (error) { 108 | console.log('err:', error) 109 | } 110 | } 111 | } -------------------------------------------------------------------------------- /src/duanzi/gaoxiaogif-detail.js: -------------------------------------------------------------------------------- 1 | var origin_key = 'b61d91ea6c3' 2 | var crypto = require('crypto') 3 | var sha1 = crypto.createHash('sha1'); 4 | var moment = require('moment') 5 | var str = moment().format('YYYY-MM-DD 00:00:00') + origin_key 6 | sha1.update(str) 7 | var keyCode = sha1.digest('hex') 8 | keyCode = keyCode.substr(-15) 9 | var fs = require("fs"); 10 | var req = require('request-promise'); 11 | var common = require('./common.js') 12 | 13 | var theUrl = 'http://www.gaoxiaogif.cn' 14 | var fromName = '搞笑动图' 15 | var fromFileName = 'gaoxiaogif' 16 | 17 | module.exports = async (browser, timeout, key) => { 18 | var getDataFromDom = async () => { 19 | await timeout(1500); 20 | 21 | var data = await page.evaluate(() => { 22 | let discuss = [] 23 | document.querySelectorAll('.box-s li').forEach(el => { 24 | if (el.querySelector('.content') && el.querySelector('.content').innerText) { 25 | discuss.push(el.querySelector('.content').innerText.split(':')[1]) 26 | } 27 | }) 28 | 29 | return { 30 | title: null, 31 | content: document.querySelector('.showtxt') ? document.querySelector('.showtxt').innerText : null, 32 | discuss: JSON.stringify(discuss), 33 | imgurl: document.querySelector('.imgp img') ? document.querySelector('.imgp img').src : null, 34 | cdn_img_url: null, 35 | zan: document.querySelector('.up').innerText, 36 | comments: null, 37 | type: null 38 | } 39 | }) 40 | 41 | data = [data] 42 | // 上传图片 43 | // data = await common.uploadImg(data) 44 | 45 | // 写文件 46 | await common.wirteFile(data, fromFileName) 47 | 48 | // 上传到后台 49 | console.log('firstUrl2:', firstUrl) 50 | await common.uploadData(fromName, data, firstUrl) 51 | } 52 | 53 | var page = await browser.newPage(); 54 | await page.goto(theUrl) 55 | await timeout(2000) 56 | 57 | var firstUrl = await page.evaluate(() => { 58 | var url = document.querySelector('.img > a').href; 59 | return url 60 | }) 61 | console.log('firstUrl:', firstUrl) 62 | 63 | await page.goto(firstUrl); 64 | await timeout(2000); 65 | await getDataFromDom() 66 | 67 | for (let index = 0; index < 10000000; index++) { 68 | var nextPage = await page.$('.fr a') 69 | 70 | await nextPage.click() 71 | await timeout(6 * 1000 * Math.random()); 72 | await getDataFromDom() 73 | } 74 | await page.close() 75 | } -------------------------------------------------------------------------------- /src/duanzi/qiubai-detail.js: -------------------------------------------------------------------------------- 1 | var fs = require("fs"); 2 | var req = require('request-promise'); 3 | var origin_key = 'b61d91ea6c3' 4 | var crypto = require('crypto') 5 | var sha1 = crypto.createHash('sha1'); 6 | var moment = require('moment') 7 | var str = moment().format('YYYY-MM-DD 00:00:00') + origin_key 8 | sha1.update(str) 9 | var keyCode = sha1.digest('hex') 10 | keyCode = keyCode.substr(-15) 11 | var common = require('./common.js') 12 | 13 | var theUrl = 'https://www.qiushibaike.com' 14 | var fromName = '糗百' 15 | var fromFileName = 'qiubai-detail' 16 | 17 | module.exports = async (browser, timeout, key) => { 18 | // 从DOM爬取数据 19 | var getDataFromDom = async () => { 20 | await timeout(1500); 21 | 22 | var data = await page.evaluate(() => { 23 | var discuss = [] 24 | document.querySelectorAll('.comments-table .main-text').forEach(el => { 25 | discuss.push(el.innerText) 26 | }) 27 | 28 | return { 29 | title: null, 30 | content: document.querySelector('.content') ? document.querySelector('.content').innerText : null, 31 | discuss: JSON.stringify(discuss), 32 | imgurl: document.querySelector('.thumb img') ? document.querySelector('.thumb img').src : null, 33 | cdn_img_url: null, 34 | zan: document.querySelector('.stats-vote .number').innerText, 35 | comments: document.querySelector('.stats-comments .number').innerText, 36 | type: null 37 | } 38 | }) 39 | data = [data] 40 | 41 | console.log(data) 42 | // 上传图片 43 | // data = await common.uploadImg(data) 44 | 45 | // 写文件 46 | await common.wirteFile(data, fromFileName) 47 | 48 | // 上传到后台 49 | console.log('firstUrl2:', firstUrl) 50 | await common.uploadData(fromName, data, firstUrl) 51 | } 52 | 53 | var page = await browser.newPage(); 54 | await page.goto(theUrl) 55 | await timeout(2000) 56 | 57 | var firstUrl = await page.evaluate(() => { 58 | var url = document.querySelector('.contentHerf').href; 59 | return url 60 | }) 61 | console.log('firstUrl:', firstUrl) 62 | 63 | await page.goto(firstUrl); 64 | await timeout(2000); 65 | await getDataFromDom() 66 | 67 | for (let index = 0; index < 10000000; index++) { 68 | var nextPage = await page.$('.page-nav-list-next') 69 | 70 | await nextPage.click() 71 | await timeout(6 * 1000 * Math.random()); 72 | await getDataFromDom() 73 | } 74 | await page.close() 75 | } -------------------------------------------------------------------------------- /src/img/5b3f553ed9fb7.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhentaoo/eva/fc06ca95716329d96c8036b445f4f10643feb771/src/img/5b3f553ed9fb7.gif -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | var { timeout } = require('./tools/tools.js'); 3 | var key = require('./config.js').key 4 | var company = require('./config.js').company 5 | 6 | // puppeteer.launch().then(async (browser) => { 7 | puppeteer.launch({ headless: false }).then(async (browser) => { 8 | console.log('startTime:', new Date().toUTCString()); 9 | try { 10 | /** 11 | * 搜索引擎 12 | */ 13 | // await require('./data-source/so.js')(browser, timeout, key) 14 | // await require('./data-source/sogou.js')(browser, timeout, key) 15 | // await require('./data-source/baidu.js')(browser, timeout, key) 16 | // await require('./data-source/bing.js')(browser, timeout, key) 17 | 18 | /** 19 | * 段子 20 | */ 21 | // await require('./duanzi/qiubai-detail.js')(browser, timeout, company) 22 | // await require('./duanzi/gaoxiaogif-detail.js')(browser, timeout, company) 23 | await require('./duanzi/budejie-detail.js')(browser, timeout, company) 24 | 25 | /** 26 | * 企业信息 27 | */ 28 | // await require('./data-source/tianyancha.js')(browser, timeout, company) 29 | } catch (error) { 30 | console.log(error) 31 | console.log('endTime:', new Date().toUTCString()); 32 | } 33 | console.log('endTime:', new Date().toUTCString()); 34 | }); 35 | -------------------------------------------------------------------------------- /src/req.js: -------------------------------------------------------------------------------- 1 | var fs = require("fs"); 2 | var req = require('request-promise'); 3 | var url = 'https://www.pengfu.com/' 4 | var origin_key = 'b61d91ea6c3' 5 | var crypto = require('crypto') 6 | var sha1 = crypto.createHash('sha1'); 7 | var moment = require('moment') 8 | 9 | var fn = async function (params) { 10 | var str = moment().format('YYYY-MM-DD 00:00:00') + origin_key 11 | console.log('str:', str) 12 | sha1.update(str) 13 | var keyCode = sha1.digest('hex') 14 | console.log('keyCode:', keyCode) 15 | keyCode = keyCode.substr(-15) 16 | console.log('keyCode15:', keyCode) 17 | 18 | var res = await req({ 19 | method: 'POST', 20 | uri: 'http://juhe.qqeasy.com/information/import-jokes', 21 | body: { 22 | "key": keyCode, 23 | "from": url, 24 | "from_url": url, 25 | "create_time": new Date().toUTCString(), 26 | "data": { 27 | "contents": [ 28 | { 29 | "title": "三地鼠使用了惊吓,你的防御下降了", 30 | "content": "", 31 | "img": "https://image7.pengfu.com/origin/180601/5b11467493589.gif" 32 | }, 33 | { 34 | "title": "幸福送给别人,悲伤留给自己", 35 | "content": "女同学结婚,邀请我做伴娘。我深感荣幸,婚宴上喝得不亦乐乎。另一位女同学趁着酒意告诉我:新娘A杯,怕婚礼上被抢风头,为选伴娘伤透脑筋。终于大家一致推荐了你,因为没有最小只有更小!", 36 | "img": null 37 | } 38 | ] 39 | } 40 | }, 41 | json: true // Aut 42 | }) 43 | 44 | console.log(res) 45 | } 46 | 47 | fn() -------------------------------------------------------------------------------- /src/tmp/pengfu.js: -------------------------------------------------------------------------------- 1 | var fs = require("fs"); 2 | var req = require('request-promise'); 3 | var url = 'https://www.pengfu.com/qutu_18.html' 4 | 5 | var origin_key = 'b61d91ea6c3' 6 | var crypto = require('crypto') 7 | var sha1 = crypto.createHash('sha1'); 8 | var moment = require('moment') 9 | 10 | var url = 'https://www.pengfu.com/qutu_18.html' 11 | var fromName = '捧腹' 12 | var fromFileName = 'pengfu' 13 | 14 | var str = moment().format('YYYY-MM-DD 00:00:00') + origin_key 15 | sha1.update(str) 16 | var keyCode = sha1.digest('hex') 17 | keyCode = keyCode.substr(-15) 18 | 19 | module.exports = async (browser, timeout, key) => { 20 | var getDataFromDom = async () => { 21 | await timeout(1500); 22 | var data = await page.evaluate(() => { 23 | var list = [...document.querySelectorAll('.list-item')] 24 | 25 | return list.map(el => { 26 | return { 27 | title: el.querySelector('.dp-b').innerText, 28 | content: el.querySelector('.content-img').innerText, 29 | imgurl: el.querySelector('.content-img > img') ? (el.querySelector('.content-img > img').getAttribute('gifsrc') || el.querySelector('.content-img > img').src) : null, 30 | cdn_img_url: null, 31 | zan: el.querySelector('.fl .ding').innerText, 32 | comments: el.querySelector('.fl .commentClick').innerText, 33 | type: el.querySelectorAll('div.fr > a') 34 | && ([...el.querySelectorAll('div.fr > a')].map(i => { return i.innerText }).join(',')) 35 | } 36 | }) 37 | }) 38 | console.log('data:', JSON.stringify(data)) 39 | 40 | // 上传图片 41 | // data = await common.uploadImg(data) 42 | 43 | // 写文件 44 | await common.wirteFile(data, fromFileName) 45 | 46 | // 上传到后台 47 | await common.uploadData(fromName, data, url) 48 | } 49 | 50 | var page = await browser.newPage(); 51 | await page.goto(url); 52 | await timeout(500); 53 | await getDataFromDom() 54 | 55 | for (let index = 0; index < 10000000; index++) { 56 | var nextPage = await page.$('.page > div > a:last-child') 57 | 58 | await nextPage.click() 59 | await timeout(6 * 1000 * Math.random()); 60 | await getDataFromDom() 61 | } 62 | 63 | await page.close() 64 | } -------------------------------------------------------------------------------- /src/tools/file-upload.js: -------------------------------------------------------------------------------- 1 | var req = require('request-promise'); 2 | var fs = require('fs') 3 | var multiparty = require('multiparty'); 4 | 5 | async function downloadImg(url, fileUrl) { 6 | req(url) 7 | .pipe(fs.createWriteStream(fileUrl)) 8 | } 9 | 10 | async function uploadImg(fileName, fileUrl, cdnUrl) { 11 | console.log(fileUrl) 12 | console.log(cdnUrl) 13 | 14 | var options = { 15 | method: 'POST', 16 | uri: cdnUrl, 17 | formData: { 18 | // Like 19 | name: 'smfile', 20 | // Like 21 | file: { 22 | value: fs.createReadStream(fileUrl), 23 | options: { 24 | filename: fileName, 25 | contentType: 'image/jpg' 26 | } 27 | }, 28 | filename: fileName 29 | }, 30 | headers: { 31 | 'content-type': 'application/x-www-form-urlencoded' // Is set automatically 32 | } 33 | }; 34 | 35 | req(options) 36 | .then(function (body) { 37 | console.log('body:', body) 38 | // POST succeeded... 39 | }) 40 | .catch(function (err) { 41 | console.log('err:', err) 42 | // POST failed... 43 | }); 44 | } 45 | 46 | async function uploadCdnImg(url) { 47 | var url = url 48 | var fileName = url.split('/').pop() 49 | var fileUrl = './src/img/' + fileName 50 | var cdnUrl = 'https://sm.ms/api/upload' 51 | 52 | await downloadImg(url, fileUrl) 53 | 54 | await uploadImg(fileName, fileUrl, cdnUrl) 55 | } 56 | 57 | uploadCdnImg('https://image7.pengfu.com/origin/180706/5b3f553ed9fb7.gif') -------------------------------------------------------------------------------- /src/tools/img-sina.js: -------------------------------------------------------------------------------- 1 | var req = require('request-promise'); 2 | var fs = require('fs') 3 | 4 | async function a () { 5 | let imgurl = 'https://image7.pengfu.com/origin/180727/5b5adaf912540.jpg' 6 | let type = imgurl.split('.').pop() 7 | 8 | let response = await req('https://api.yum6.cn/sinaimg.php?img=' + imgurl) 9 | response = JSON.parse(response) 10 | let cdn_img_url = 'https://ww2.sinaimg.cn/large/' + response.pid + '.' + type 11 | 12 | console.log(response) 13 | console.log(cdn_img_url) 14 | // item.cdn_img_url = 'https://ww2.sinaimg.cn/large/' + response.pid 15 | } 16 | 17 | a() -------------------------------------------------------------------------------- /src/tools/tools.js: -------------------------------------------------------------------------------- 1 | var fs = require("fs"); 2 | 3 | class Tools { 4 | static appendFileSync(path, data, callback) { 5 | fs.appendFileSync(path, data, callback); 6 | } 7 | 8 | static timeout(delay) { 9 | return new Promise((resolve, reject) => { 10 | setTimeout(() => { 11 | try { 12 | resolve(1) 13 | } catch (e) { 14 | reject(0) 15 | } 16 | }, delay) 17 | }) 18 | } 19 | 20 | /** 21 | * [TimeTools description] 22 | * @param {[type]} timestamp 12312312312312 23 | * @param {[type]} formatStr Y年M月D日 24 | * 25 | * M: month 1~12 26 | * Y: year 2017 27 | * D: date 0 ~ 31 28 | */ 29 | static moment(formatStr, timestamp) { 30 | let date = new Date(timestamp || new Date().getTime()) 31 | 32 | let M = date.getMonth() + 1 33 | 34 | let Y = date.getFullYear() 35 | 36 | let D = date.getDate() 37 | 38 | let h = date.getHours() 39 | 40 | let m = date.getMinutes() 41 | 42 | let s = date.getSeconds() 43 | 44 | return formatStr.replace('M', M).replace('Y', Y).replace('D', D).replace('h', h).replace('m', m).replace('s', s) 45 | } 46 | } 47 | 48 | module.exports = Tools; --------------------------------------------------------------------------------