├── .gitignore ├── README.md ├── demo.js ├── index.js ├── package-lock.json ├── package.json └── src ├── check-env.js ├── recognition.js └── spider.js /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | 8 | # Runtime data 9 | pids 10 | *.pid 11 | *.seed 12 | *.pid.lock 13 | 14 | # Directory for instrumented libs generated by jscoverage/JSCover 15 | lib-cov 16 | 17 | # Coverage directory used by tools like istanbul 18 | coverage 19 | 20 | # nyc test coverage 21 | .nyc_output 22 | 23 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 24 | .grunt 25 | 26 | # Bower dependency directory (https://bower.io/) 27 | bower_components 28 | 29 | # node-waf configuration 30 | .lock-wscript 31 | 32 | # Compiled binary addons (http://nodejs.org/api/addons.html) 33 | build/Release 34 | 35 | # Dependency directories 36 | node_modules/ 37 | jspm_packages/ 38 | 39 | # Typescript v1 declaration files 40 | typings/ 41 | 42 | # Optional npm cache directory 43 | .npm 44 | 45 | # Optional eslint cache 46 | .eslintcache 47 | 48 | # Optional REPL history 49 | .node_repl_history 50 | 51 | # Output of 'npm pack' 52 | *.tgz 53 | 54 | # Yarn Integrity file 55 | .yarn-integrity 56 | 57 | # dotenv environment variables file 58 | .env 59 | 60 | images/* 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BAIDU-INDEX-SPIDER 2 | 3 | 百度指数爬虫,node版本 4 | 5 | ## Usage 6 | 7 | **依赖Tesseract,必须先安装Tesseract。** 8 | 9 | https://github.com/tesseract-ocr/tesseract/wiki#installation 10 | 11 | 12 | **在node里引用:** 13 | 14 | ``` 15 | npm install baidu-index-spider 16 | ``` 17 | 18 | ``` javascript 19 | const BDIndexSpider = require('baidu-index-spider'); 20 | 21 | const keyword = '要爬取的关键词'; 22 | const username = '百度帐号'; 23 | const password = ''; 24 | 25 | const result = BDIndexSpider.run(keyword, { sername, password}).then((result) => { 26 | console.log(result); 27 | }); 28 | ``` 29 | 30 | -------------------------------------------------------------------------------- /demo.js: -------------------------------------------------------------------------------- 1 | const BDIndexSpider = require('./index'); 2 | 3 | const account = { 4 | username: '', 5 | password: '' 6 | }; 7 | 8 | const result = BDIndexSpider.run('百度', {...account}).then((result) => { 9 | console.log('💻 数据:'); 10 | console.log(result); 11 | }); -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | const path = require('path'); 2 | const fs = require('fs'); 3 | const checkEnv = require('./src/check-env'); 4 | const recognition = require('./src/recognition'); 5 | const Spider = require('./src/spider'); 6 | 7 | // check env 8 | const check = checkEnv.run(); 9 | if (!check) { 10 | return; 11 | } 12 | 13 | const imgDir = path.resolve(process.cwd(), './images'); 14 | 15 | module.exports = { 16 | async run (word, options, puppeteerOptions = { headless: true }) { 17 | const spider = new Spider({ 18 | imgDir, 19 | ...options 20 | }, puppeteerOptions); 21 | 22 | // 抓取数据 23 | await spider.run(word); 24 | 25 | // 图像识别 26 | const wordDir = path.resolve(imgDir, word); 27 | const imgNames = fs.readdirSync(wordDir); 28 | const result = []; 29 | 30 | imgNames = imgNames.filter(item => path.extname(item) === '.png'); 31 | 32 | for (let i = 0; i < imgNames.length; i++) { 33 | const imgPath = path.resolve(wordDir, imgNames[i]); 34 | const val = await recognition.run(imgPath); 35 | result.push(val); 36 | } 37 | 38 | return result; 39 | } 40 | } 41 | 42 | -------------------------------------------------------------------------------- /package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "baidu-index-sprite", 3 | "version": "1.0.0", 4 | "lockfileVersion": 1, 5 | "requires": true, 6 | "dependencies": { 7 | "agent-base": { 8 | "version": "4.2.0", 9 | "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-4.2.0.tgz", 10 | "integrity": "sha512-c+R/U5X+2zz2+UCrCFv6odQzJdoqI+YecuhnAJLa1zYaMc13zPfwMwZrr91Pd1DYNo/yPRbiM4WVf9whgwFsIg==", 11 | "requires": { 12 | "es6-promisify": "5.0.0" 13 | } 14 | }, 15 | "async-limiter": { 16 | "version": "1.0.0", 17 | "resolved": "https://registry.npmjs.org/async-limiter/-/async-limiter-1.0.0.tgz", 18 | "integrity": "sha512-jp/uFnooOiO+L211eZOoSyzpOITMXx1rBITauYykG3BRYPu8h0UcxsPNB04RR5vo4Tyz3+ay17tR6JVf9qzYWg==" 19 | }, 20 | "balanced-match": { 21 | "version": "1.0.0", 22 | "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz", 23 | "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c=" 24 | }, 25 | "brace-expansion": { 26 | "version": "1.1.8", 27 | "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.8.tgz", 28 | "integrity": "sha1-wHshHHyVLsH479Uad+8NHTmQopI=", 29 | "requires": { 30 | "balanced-match": "1.0.0", 31 | "concat-map": "0.0.1" 32 | } 33 | }, 34 | "concat-map": { 35 | "version": "0.0.1", 36 | "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", 37 | "integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=" 38 | }, 39 | "concat-stream": { 40 | "version": "1.6.0", 41 | "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.0.tgz", 42 | "integrity": "sha1-CqxmL9Ur54lk1VMvaUeE5wEQrPc=", 43 | "requires": { 44 | "inherits": "2.0.3", 45 | "readable-stream": "2.3.3", 46 | "typedarray": "0.0.6" 47 | } 48 | }, 49 | "core-util-is": { 50 | "version": "1.0.2", 51 | "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", 52 | "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=" 53 | }, 54 | "debug": { 55 | "version": "2.6.9", 56 | "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", 57 | "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", 58 | "requires": { 59 | "ms": "2.0.0" 60 | } 61 | }, 62 | "es6-promise": { 63 | "version": "4.2.4", 64 | "resolved": "https://registry.npmjs.org/es6-promise/-/es6-promise-4.2.4.tgz", 65 | "integrity": "sha512-/NdNZVJg+uZgtm9eS3O6lrOLYmQag2DjdEXuPaHlZ6RuVqgqaVZfgYCepEIKsLqwdQArOPtC3XzRLqGGfT8KQQ==" 66 | }, 67 | "es6-promisify": { 68 | "version": "5.0.0", 69 | "resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz", 70 | "integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=", 71 | "requires": { 72 | "es6-promise": "4.2.4" 73 | } 74 | }, 75 | "extract-zip": { 76 | "version": "1.6.6", 77 | "resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-1.6.6.tgz", 78 | "integrity": "sha1-EpDt6NINCHK0Kf0/NRyhKOxe+Fw=", 79 | "requires": { 80 | "concat-stream": "1.6.0", 81 | "debug": "2.6.9", 82 | "mkdirp": "0.5.0", 83 | "yauzl": "2.4.1" 84 | } 85 | }, 86 | "fd-slicer": { 87 | "version": "1.0.1", 88 | "resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.0.1.tgz", 89 | "integrity": "sha1-i1vL2ewyfFBBv5qwI/1nUPEXfmU=", 90 | "requires": { 91 | "pend": "1.2.0" 92 | } 93 | }, 94 | "fs.realpath": { 95 | "version": "1.0.0", 96 | "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", 97 | "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=" 98 | }, 99 | "glob": { 100 | "version": "7.1.2", 101 | "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.2.tgz", 102 | "integrity": "sha512-MJTUg1kjuLeQCJ+ccE4Vpa6kKVXkPYJ2mOCQyUuKLcLQsdrMCpBPUi8qVE6+YuaJkozeA9NusTAw3hLr8Xe5EQ==", 103 | "requires": { 104 | "fs.realpath": "1.0.0", 105 | "inflight": "1.0.6", 106 | "inherits": "2.0.3", 107 | "minimatch": "3.0.4", 108 | "once": "1.4.0", 109 | "path-is-absolute": "1.0.1" 110 | } 111 | }, 112 | "https-proxy-agent": { 113 | "version": "2.1.1", 114 | "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.1.1.tgz", 115 | "integrity": "sha512-LK6tQUR/VOkTI6ygAfWUKKP95I+e6M1h7N3PncGu1CATHCnex+CAv9ttR0lbHu1Uk2PXm/WoAHFo6JCGwMjVMw==", 116 | "requires": { 117 | "agent-base": "4.2.0", 118 | "debug": "3.1.0" 119 | }, 120 | "dependencies": { 121 | "debug": { 122 | "version": "3.1.0", 123 | "resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz", 124 | "integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==", 125 | "requires": { 126 | "ms": "2.0.0" 127 | } 128 | } 129 | } 130 | }, 131 | "inflight": { 132 | "version": "1.0.6", 133 | "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", 134 | "integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=", 135 | "requires": { 136 | "once": "1.4.0", 137 | "wrappy": "1.0.2" 138 | } 139 | }, 140 | "inherits": { 141 | "version": "2.0.3", 142 | "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", 143 | "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4=" 144 | }, 145 | "isarray": { 146 | "version": "1.0.0", 147 | "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", 148 | "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=" 149 | }, 150 | "mime": { 151 | "version": "1.6.0", 152 | "resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz", 153 | "integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==" 154 | }, 155 | "minimatch": { 156 | "version": "3.0.4", 157 | "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", 158 | "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==", 159 | "requires": { 160 | "brace-expansion": "1.1.8" 161 | } 162 | }, 163 | "minimist": { 164 | "version": "0.0.8", 165 | "resolved": "https://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz", 166 | "integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0=" 167 | }, 168 | "mkdirp": { 169 | "version": "0.5.0", 170 | "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.0.tgz", 171 | "integrity": "sha1-HXMHam35hs2TROFecfzAWkyavxI=", 172 | "requires": { 173 | "minimist": "0.0.8" 174 | } 175 | }, 176 | "ms": { 177 | "version": "2.0.0", 178 | "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", 179 | "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=" 180 | }, 181 | "once": { 182 | "version": "1.4.0", 183 | "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", 184 | "integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=", 185 | "requires": { 186 | "wrappy": "1.0.2" 187 | } 188 | }, 189 | "path-is-absolute": { 190 | "version": "1.0.1", 191 | "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", 192 | "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=" 193 | }, 194 | "pend": { 195 | "version": "1.2.0", 196 | "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz", 197 | "integrity": "sha1-elfrVQpng/kRUzH89GY9XI4AelA=" 198 | }, 199 | "process-nextick-args": { 200 | "version": "1.0.7", 201 | "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-1.0.7.tgz", 202 | "integrity": "sha1-FQ4gt1ZZCtP5EJPyWk8q2L/zC6M=" 203 | }, 204 | "progress": { 205 | "version": "2.0.0", 206 | "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.0.tgz", 207 | "integrity": "sha1-ihvjZr+Pwj2yvSPxDG/pILQ4nR8=" 208 | }, 209 | "proxy-from-env": { 210 | "version": "1.0.0", 211 | "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.0.0.tgz", 212 | "integrity": "sha1-M8UDmPcOp+uW0h97gXYwpVeRx+4=" 213 | }, 214 | "puppeteer": { 215 | "version": "1.0.0", 216 | "resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-1.0.0.tgz", 217 | "integrity": "sha512-e00NMdUL32YhBcua9OkVXHgyDEMBWJhDXkYNv0pyKRU1Z1OrsRm5zCpppAdxAsBI+/MJBspFNfOUZuZ24qPGMQ==", 218 | "requires": { 219 | "debug": "2.6.9", 220 | "extract-zip": "1.6.6", 221 | "https-proxy-agent": "2.1.1", 222 | "mime": "1.6.0", 223 | "progress": "2.0.0", 224 | "proxy-from-env": "1.0.0", 225 | "rimraf": "2.6.2", 226 | "ws": "3.3.3" 227 | } 228 | }, 229 | "readable-stream": { 230 | "version": "2.3.3", 231 | "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.3.tgz", 232 | "integrity": "sha512-m+qzzcn7KUxEmd1gMbchF+Y2eIUbieUaxkWtptyHywrX0rE8QEYqPC07Vuy4Wm32/xE16NcdBctb8S0Xe/5IeQ==", 233 | "requires": { 234 | "core-util-is": "1.0.2", 235 | "inherits": "2.0.3", 236 | "isarray": "1.0.0", 237 | "process-nextick-args": "1.0.7", 238 | "safe-buffer": "5.1.1", 239 | "string_decoder": "1.0.3", 240 | "util-deprecate": "1.0.2" 241 | } 242 | }, 243 | "rimraf": { 244 | "version": "2.6.2", 245 | "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.2.tgz", 246 | "integrity": "sha512-lreewLK/BlghmxtfH36YYVg1i8IAce4TI7oao75I1g245+6BctqTVQiBP3YUJ9C6DQOXJmkYR9X9fCLtCOJc5w==", 247 | "requires": { 248 | "glob": "7.1.2" 249 | } 250 | }, 251 | "safe-buffer": { 252 | "version": "5.1.1", 253 | "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.1.tgz", 254 | "integrity": "sha512-kKvNJn6Mm93gAczWVJg7wH+wGYWNrDHdWvpUmHyEsgCtIwwo3bqPtV4tR5tuPaUhTOo/kvhVwd8XwwOllGYkbg==" 255 | }, 256 | "string_decoder": { 257 | "version": "1.0.3", 258 | "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.0.3.tgz", 259 | "integrity": "sha512-4AH6Z5fzNNBcH+6XDMfA/BTt87skxqJlO0lAh3Dker5zThcAxG6mKz+iGu308UKoPPQ8Dcqx/4JhujzltRa+hQ==", 260 | "requires": { 261 | "safe-buffer": "5.1.1" 262 | } 263 | }, 264 | "typedarray": { 265 | "version": "0.0.6", 266 | "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz", 267 | "integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c=" 268 | }, 269 | "ultron": { 270 | "version": "1.1.1", 271 | "resolved": "https://registry.npmjs.org/ultron/-/ultron-1.1.1.tgz", 272 | "integrity": "sha512-UIEXBNeYmKptWH6z8ZnqTeS8fV74zG0/eRU9VGkpzz+LIJNs8W/zM/L+7ctCkRrgbNnnR0xxw4bKOr0cW0N0Og==" 273 | }, 274 | "util-deprecate": { 275 | "version": "1.0.2", 276 | "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", 277 | "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=" 278 | }, 279 | "wrappy": { 280 | "version": "1.0.2", 281 | "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", 282 | "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=" 283 | }, 284 | "ws": { 285 | "version": "3.3.3", 286 | "resolved": "https://registry.npmjs.org/ws/-/ws-3.3.3.tgz", 287 | "integrity": "sha512-nnWLa/NwZSt4KQJu51MYlCcSQ5g7INpOrOMt4XV8j4dqTXdmlUmSHQ8/oLC069ckre0fRsgfvsKwbTdtKLCDkA==", 288 | "requires": { 289 | "async-limiter": "1.0.0", 290 | "safe-buffer": "5.1.1", 291 | "ultron": "1.1.1" 292 | } 293 | }, 294 | "yauzl": { 295 | "version": "2.4.1", 296 | "resolved": "https://registry.npmjs.org/yauzl/-/yauzl-2.4.1.tgz", 297 | "integrity": "sha1-lSj0QtqxsihOWLQ3m7GU4i4MQAU=", 298 | "requires": { 299 | "fd-slicer": "1.0.1" 300 | } 301 | } 302 | } 303 | } 304 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "baidu-index-sprite", 3 | "version": "1.0.1", 4 | "description": "百度指数爬虫", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "author": "岛书", 10 | "license": "MIT", 11 | "dependencies": { 12 | "jimp": "^0.2.28", 13 | "node-tesseract": "^0.2.7", 14 | "puppeteer": "^1.0.0" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/check-env.js: -------------------------------------------------------------------------------- 1 | /** 2 | * 环境依赖检查 3 | */ 4 | 5 | const cmd = require('child_process'); 6 | 7 | // 全局依赖 8 | const deps = ['tesseract']; 9 | 10 | module.exports = { 11 | run: () => { 12 | console.log('🍵 正在检测依赖...'); 13 | 14 | return deps.every((dep) => { 15 | try { 16 | cmd.execSync(dep); 17 | return true; 18 | } catch (err) { 19 | console.error('❌ 缺少依赖:' + dep + ',请先安装该依赖并配置到环境变量'); 20 | } 21 | }); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/recognition.js: -------------------------------------------------------------------------------- 1 | const path = require('path'); 2 | const Tesseract = require('node-tesseract'); 3 | 4 | module.exports = { 5 | /** 6 | * 识别图片中的文字 7 | * @param {string} imgPath 图片路径 8 | */ 9 | run: async (imgPath) => { 10 | return new Promise((resolve, reject) => { 11 | console.log('📷 开始识别图片...'); 12 | 13 | Tesseract.process(imgPath, { psm: 7 }, function (err, val) { 14 | if (err || val == null) { 15 | console.error('❌ 识别失败:' + imgPath); 16 | reject(err); 17 | return; 18 | } 19 | 20 | const date = path.basename(imgPath, path.extname(imgPath)); 21 | // 针对常见错误做手动修复 22 | // 更复杂的场景靠训练模型来提高准确率 23 | val = val 24 | .replace(/(\,|\.|\s+)/g, '') 25 | .replace(/\?/g, '7') 26 | .replace(/\'3/g, 9) 27 | .replace(/\‘/g, ''); 28 | 29 | console.log('✅ 识别成功 ' + date + ':' + val); 30 | resolve({ 31 | title: date, 32 | value: val 33 | }); 34 | }); 35 | }); 36 | } 37 | } -------------------------------------------------------------------------------- /src/spider.js: -------------------------------------------------------------------------------- 1 | const path = require('path'); 2 | const fs = require('fs'); 3 | const puppeteer = require('puppeteer'); 4 | const jimp = require('jimp'); 5 | 6 | const BAIDU_INDEX_URL = 'http://index.baidu.com'; 7 | const BAIDU_INDEX_DETAIL_URL = 'http://index.baidu.com/?tpl=trend&word='; 8 | 9 | class Spider { 10 | constructor (options, puppeteerOptions) { 11 | this.options = options; 12 | this.puppeteerOptions = puppeteerOptions; 13 | } 14 | 15 | async initBrowser () { 16 | if (!this.browser) { 17 | this.browser = await puppeteer.launch(this.puppeteerOptions); 18 | } 19 | return this.browser; 20 | } 21 | 22 | async initPage () { 23 | const page = await this.browser.newPage(); 24 | page.setViewport({ 25 | width: 1280, 26 | height: 500 27 | }); 28 | return page; 29 | } 30 | 31 | createImgDir (word) { 32 | const imgDir = path.resolve(this.options.imgDir, './' + word); 33 | if (!fs.existsSync(imgDir)) { 34 | fs.mkdir(imgDir) 35 | } 36 | return imgDir; 37 | } 38 | 39 | async run (word) { 40 | console.log('🚀 Spider启动:[' + word + ']'); 41 | 42 | const browser = await this.initBrowser(); 43 | const page = await this.initPage(); 44 | const imgDir = this.createImgDir(word); 45 | 46 | await page.goto(BAIDU_INDEX_URL); 47 | 48 | // 模拟登陆 49 | console.log('😁 开始登录...'); 50 | await page.click('#userbar li:nth-child(4)'); 51 | await page.waitForSelector('#TANGRAM_12__userName'); 52 | await page.type('#TANGRAM_12__userName', this.options.username); 53 | await page.type('#TANGRAM_12__password', this.options.password); 54 | await page.click('#TANGRAM_12__submit'); 55 | await page.waitForNavigation(); 56 | console.log('✅ 登录成功'); 57 | 58 | // 跳到指数页面 59 | await page.type('#schword', word); 60 | await page.click('#searchWords'); 61 | 62 | // 等待ajax请求结束,图表绘制 63 | await page.waitForSelector('#trend > svg > image'); 64 | 65 | // 获取chart最前、最后的坐标 66 | const position = await page.evaluate(() => { 67 | const $image = document.querySelector('#trend > svg > image'); 68 | const $area = document.querySelector('#trend-wrap .grpArea'); 69 | 70 | const areaRect = $area.getBoundingClientRect(); 71 | const imageRect = $image.getBoundingClientRect(); 72 | 73 | // 滚动到图表可视化区域 74 | window.scrollBy(0, areaRect.top); 75 | 76 | return { 77 | startX: imageRect.x, 78 | endX: imageRect.x + imageRect.width - 1, 79 | y: 200 80 | } 81 | }); 82 | 83 | console.log('📝 开始抓取数据...'); 84 | 85 | for (let i = 0, count = 30, lastTitle; i < count; i++) { 86 | // 每次移动15像素,看tooltip上的日期是否与上一次相同,相同则继续移动15像素,否则抓取图片 87 | // 对起点做特殊处理 88 | let x = i === count - 1 ? (position.startX + 4) : (position.endX - i * 15); 89 | 90 | // 移动鼠标,触发tooltip 91 | await page.mouse.move(x, position.y); 92 | await page.waitForSelector('#trendPopTab .view-value .imgval'); 93 | await page.waitFor(150); 94 | 95 | // 获取tooltip信息 96 | const valueInfo = await page.evaluate(() => { 97 | const $tooltip = document.querySelector('#viewbox'); 98 | const $title = $tooltip.querySelector('.view-bd:first-child .view-table-wrap'); 99 | const $value = $tooltip.querySelector('#trendPopTab .view-value'); 100 | const valueRect = $value.getBoundingClientRect(); 101 | 102 | return { 103 | title: $title.textContent.split(' ')[0], 104 | x: valueRect.x - 5, 105 | y: valueRect.y, 106 | width: valueRect.width + 10, 107 | height: valueRect.height 108 | } 109 | }); 110 | 111 | // 本次无效,继续移动 112 | if (valueInfo.title === lastTitle) { 113 | count++; 114 | continue; 115 | } 116 | 117 | lastTitle = valueInfo.title; 118 | 119 | const imgPath = path.resolve(imgDir, valueInfo.title + '.png'); 120 | await page.screenshot({ path: imgPath }); 121 | 122 | // 对图片进行裁剪,只保留数字部分 123 | const img = await jimp.read(imgPath); 124 | await img.crop(valueInfo.x, valueInfo.y, valueInfo.width, valueInfo.height); 125 | // 放大图片,提高识别准确率 126 | await img.scale(5); 127 | await img.write(imgPath); 128 | 129 | console.log('✅ 抓取成功,生成图片:' + imgPath); 130 | } 131 | } 132 | } 133 | 134 | module.exports = Spider; --------------------------------------------------------------------------------