├── .gitignore ├── LICENSE ├── README.md ├── crawler.js ├── job └── cookie.js ├── lib ├── redis.js └── util.js └── package.json /.gitignore: -------------------------------------------------------------------------------- 1 | out 2 | node_modules 3 | .DS_Store 4 | result.xml 5 | .idea 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 yanjixiong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # weixin-crawler-es5 2 | 3 | 这还是个非常简单的微信爬虫,但是实现了抓取的过程,以及cookie的问题。 4 | 5 | ## How to use 6 | 7 | ### 启动爬虫: 8 | ```js 9 | npm start 10 | ``` 11 | 12 | ### 启动cookie池任务 13 | 14 | ```js 15 | npm run cookie 16 | ``` 17 | 18 | ## Contact 19 | 20 | Email:yjk99@qq.com 21 | -------------------------------------------------------------------------------- /crawler.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | var request = require('urllib-sync').request; 4 | var sleep = require('sleep').sleep; 5 | var path = require('path'); 6 | var fs = require('fs'); 7 | var util = require('util'); 8 | var cheerio = require('cheerio'); 9 | var req = require('request'); 10 | var redis = require('./lib/redis'); 11 | var cookie = require('./job/cookie'); 12 | 13 | var apiRoot = 'http://weixin.sogou.com'; 14 | var userAgent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'; 15 | var mockHeaders = { 16 | 'Cookie': 'CXID=B3EBF622BC23A4DD15784FC9617F7C36; SUID=52FC111B142D900A55B72DFB0004A20B; SUV=1439361586856051; pgv_pvi=2340838400; GOTO=Af99046; ssuid=2533552660; ABTEST=7|' + parseInt(new Date().getTime() / 1000 + '') + '|v1; weixinIndexVisited=1; sct=28; ld=Lkllllllll2Q1IgtlllllVbA1FwlllllpenAGyllllwllllljZlll5@@@@@@@@@@; ad=$lllllllll2qHhTElllllVboMpolllllpe4DUkllll9lllll9llll5@@@@@@@@@@; SNUID={SNUID}; IPLOC=CN4201', 17 | 'Host': 'weixin.sogou.com', 18 | 'User-Agent': userAgent, 19 | }; 20 | var skipPage = 0; 21 | var totalPage = 10; 22 | var interval = 10; // 60s 23 | 24 | function onerror(err) { 25 | console.log(err); 26 | console.log(err.stack); 27 | process.exit(1); 28 | } 29 | 30 | function ensureResult(body) { 31 | if (body.indexOf('您的访问出错了') >= 0) { 32 | var err = new Error('Reached list request limit.'); 33 | err.name = 'RequestListLimited'; 34 | err.url = url; 35 | err.originBody = body; 36 | return onerror(err); 37 | } 38 | } 39 | 40 | // 请求文章列表页 41 | function requestList(page) { 42 | var url = apiRoot + util.format('/weixin?query=%s&sourceid=inttime_day&type=2&interation=&tsn=1&t=' + new Date().getTime(), 'node.js'); 43 | console.log(apiRoot); 44 | console.log('[%s] %s', new Date(), url); 45 | var result = request(url, { 46 | timeout: 5000, 47 | headers: mockHeaders, 48 | }); 49 | var body = result.data.toString(); 50 | // console.log(body); 51 | ensureResult(body); 52 | // console.log(body); 53 | handleList(body); 54 | } 55 | 56 | // 请求文章详情页 57 | function requestArticle(link) { 58 | var url = link.indexOf('weixin.qq') === -1 ? apiRoot + link : link; 59 | console.log('[%s] requestArticle => %s', new Date(), url); 60 | var result = request(url, { 61 | timeout: 5000, 62 | headers: mockHeaders, 63 | }); 64 | var body = result.data.toString(); 65 | var headers = result.headers || {}; 66 | var redirUrl = headers['location'] || ''; 67 | ensureResult(body); 68 | if (String(result.status)[0] !== '3' || 69 | !redirUrl || redirUrl.indexOf('antispider') >= 0) { 70 | 71 | var err = new Error('Request article failed.'); 72 | err.name = 'RequestArticleError'; 73 | err.url = url; 74 | err.originBody = body; 75 | return onerror(err); 76 | } 77 | console.log('[%s] redirUrl => %s', new Date(), redirUrl); 78 | var redirResult = request(redirUrl, {timeout: 5000}); 79 | var redirBody = redirResult.data.toString(); 80 | ensureResult(redirBody); 81 | return redirBody; 82 | } 83 | 84 | /** 85 | * 解析真实url 86 | * @returns {*|string} 87 | */ 88 | function handleRedirectUrl(link) { 89 | var url = link.indexOf('weixin.qq') === -1 ? apiRoot + link : link; 90 | //console.log('[%s] requestArticle => %s', new Date(), url); 91 | var result = request(url, { 92 | timeout: 5000, 93 | headers: mockHeaders 94 | }); 95 | var body = result.data.toString(); 96 | console.log(body); 97 | var headers = result.headers || {}; 98 | //console.log('redirectUrl', headers['location']); 99 | return headers['location'] || ''; 100 | } 101 | 102 | function handleList(res) { 103 | var articleList = []; 104 | var $ = cheerio.load(res, {normalizeWhitespace: true}); 105 | var $chapters = $('.wx-rb .txt-box'); 106 | 107 | $chapters.each(function(index, chapter) { 108 | var title = $(chapter).find('h4 a').text(); 109 | var link = $(chapter).find('h4 a').attr('href'); 110 | var $weixinAccount = $(chapter).find('.s-p a#weixin_account'); 111 | var weixinAccountName = $weixinAccount.attr('title'); 112 | var weixinAccountLink = $weixinAccount.attr('href'); 113 | 114 | var redirectUrl = handleRedirectUrl(link); 115 | 116 | if (redirectUrl && redirectUrl.indexOf('antispider') < 0) { 117 | var article = { 118 | title: title, 119 | link: redirectUrl, 120 | accountName: weixinAccountName, 121 | accountLink: apiRoot + weixinAccountLink, 122 | category: 'node.js' 123 | }; 124 | 125 | articleList.push(article); 126 | 127 | console.log(article); 128 | 129 | sleep(interval); 130 | } 131 | }); 132 | 133 | console.log('articleList: ', articleList); 134 | 135 | articleList.forEach(function(article) { 136 | console.log('post data =>', article); 137 | req 138 | .post('http://blog.gaoqixhb.com/api/topic/add') 139 | .form(article); 140 | }); 141 | } 142 | 143 | /** 144 | * 爬取 145 | */ 146 | function crawl() { 147 | console.log('key:', cookie.key); 148 | redis 149 | .multi() 150 | .srandmember(cookie.key) 151 | .exec(function(err, result) { 152 | if (err) return onerror(err); 153 | result = result || '6E58D903A9AD86069D3733E3A916887E'; 154 | 155 | console.log('get SNUID from pool:', result); 156 | mockHeaders.Cookie = mockHeaders.Cookie.replace('{SNUID}', result); 157 | if (err) return onerror(err); 158 | for (var page = 1 + skipPage; page <= totalPage; page++) { 159 | requestList(page); 160 | sleep(interval); 161 | } 162 | }); 163 | } 164 | 165 | crawl(); 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /job/cookie.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * 获取cookie 3 | * Copyright(c) 2016 yanjixiong 4 | */ 5 | 6 | var request = require('superagent'); 7 | var schedule = require('node-schedule'); 8 | var client = require('../lib/redis'); 9 | var util = require('../lib/util'); 10 | 11 | exports.key = 'crawler:snuidContainer'; 12 | 13 | /** 14 | * 获取cookie 15 | */ 16 | function getCookie() { 17 | for(var i = 0, len = 10; i < len; i++) { 18 | setTimeout(function() { 19 | request 20 | .get('http://weixin.sogou.com/weixin?query=' + util.randomStr(2).base62()) 21 | .end(function(err, res){ 22 | var SNUID = res.header['set-cookie'][1].split(';')[0]; 23 | SNUID = SNUID.indexOf('SNUID') !== -1 ? SNUID.split('=')[1] : ''; 24 | if (SNUID) { 25 | client.sadd(exports.key, SNUID, function(err, result) { 26 | console.log('result: ', result); 27 | }); 28 | } 29 | console.log('SNUID:', res.header['set-cookie'][1].split(';')[0].split('=')[1]); 30 | }); 31 | }, 1000 * i); 32 | } 33 | } 34 | 35 | /** 36 | * 清除容器 37 | */ 38 | function clearContainer() { 39 | client.del(exports.key); 40 | } 41 | 42 | console.log('Get Cookie Job Start...'); 43 | schedule.scheduleJob('* * */6 * *', getCookie); // do job every six hours 44 | schedule.scheduleJob('* * */6 * *', clearContainer);//clear cookie pool every six hours 45 | clearContainer(); 46 | getCookie(); -------------------------------------------------------------------------------- /lib/redis.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * redis client 3 | * Copyright(c) 2016 yanjixiong 4 | */ 5 | 6 | var redis = require('redis'); 7 | var client = redis.createClient(6379, '127.0.0.1'); 8 | 9 | module.exports = client; -------------------------------------------------------------------------------- /lib/util.js: -------------------------------------------------------------------------------- 1 | /** 2 | * 工具类 3 | */ 4 | 5 | exports.randomStr = function (length) { 6 | var base62 = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'; 7 | var base36 = 'abcdefghijklmnopqrstuvwxyz0123456789'; 8 | var base10 = '0123456789'; 9 | 10 | function create(chars) { 11 | return function random() { 12 | var salt = ''; 13 | for (var i = 0; i < length; i++) salt += chars[Math.floor(chars.length * Math.random())] 14 | return salt 15 | } 16 | } 17 | 18 | return { 19 | base62: create(base62), 20 | base36: create(base36), 21 | base10: create(base10) 22 | } 23 | }; 24 | 25 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "weixin-crawler-es5", 3 | "version": "0.0.1", 4 | "description": "", 5 | "scripts": { 6 | "start": "node crawler.js", 7 | "cookie": "node job/cookie.js" 8 | }, 9 | "author": "luoyjx", 10 | "license": "MIT", 11 | "dependencies": { 12 | "cheerio": "0.15.0", 13 | "node-schedule": "^1.1.0", 14 | "redis": "^1.0.0", 15 | "request": "^2.69.0", 16 | "sleep": "~3.0.0", 17 | "superagent": "^1.8.1", 18 | "urllib-sync": "~1.1.2" 19 | }, 20 | "engines": { 21 | "node": "4.2.1" 22 | } 23 | } 24 | --------------------------------------------------------------------------------