├── weixin
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── gongzhonghao.py
│   ├── conn.py
│   ├── getBody.js
│   ├── items.py
│   ├── pipelines.py
│   └── settings.py
├── .gitignore
├── README.md
└── scrapy.cfg

/weixin/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
--------------------------------------------------------------------------------
/weixin/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# weixin-crawler

Batch crawler for WeChat official account (公众号) articles.

## Tutorials

* http://www.shareditor.com/blogshow/?blogId=43
* http://www.shareditor.com/blogshow/?blogId=44
* http://www.shareditor.com/blogshow/?blogId=45
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = weixin.settings

[deploy]
#url = http://localhost:6800/
project = weixin
--------------------------------------------------------------------------------
/weixin/conn.py:
--------------------------------------------------------------------------------
# coding:utf-8

import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import MySQLdb


class Conn:
    """Central place for the MySQL connection used by the spider and the pipeline."""

    def getConnection(self):
        # Credentials are hard-coded; adjust them to match your local MySQL setup.
        conn = MySQLdb.connect(host="127.0.0.1", user="lichuang", passwd="qwerty",
                               db="sharenote2.0", charset="utf8")
        return conn
--------------------------------------------------------------------------------
/weixin/getBody.js:
--------------------------------------------------------------------------------
// PhantomJS helper: load the URL passed as the first command-line argument,
// grab document.body.innerHTML once the page has finished loading, and print
// it to stdout shortly afterwards so the spider can parse the rendered page.
var page = require('webpage').create();
var system = require('system');
page.open(system.args[1], function(status) {
    var sc = page.evaluate(function() {
        return document.body.innerHTML;
    });
    window.setTimeout(function() {
        console.log(sc);
        phantom.exit();
    }, 100);
});
--------------------------------------------------------------------------------
/weixin/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class WeixinItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    create_time = scrapy.Field()
    source = scrapy.Field()
    body = scrapy.Field()
    content = scrapy.Field()
--------------------------------------------------------------------------------
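The pipeline that follows writes every item into a MySQL table named CrawlPage, but the repository does not ship the table definition. Below is a minimal sketch of a schema that fits the INSERT in pipelines.py and the title-based duplicate check in gongzhonghao.py; the column types, sizes, the id column, and the unique key on title are assumptions, not part of the original project.

# create_table.py -- hypothetical helper, not part of the original repository.
# Creates a CrawlPage table compatible with the INSERT in pipelines.py and the
# select-by-title duplicate check in gongzhonghao.py. Column types are guesses.
from weixin.conn import Conn

CREATE_SQL = """
CREATE TABLE IF NOT EXISTS CrawlPage (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    source VARCHAR(255),
    body MEDIUMTEXT,
    content MEDIUMTEXT,
    create_time VARCHAR(64),
    UNIQUE KEY uk_title (title)
) DEFAULT CHARSET=utf8
"""

if __name__ == '__main__':
    conn = Conn().getConnection()
    cursor = conn.cursor()
    cursor.execute(CREATE_SQL)
    conn.commit()
    conn.close()

The unique key on title is what makes the pipeline's `insert ignore` behave as a de-duplicating insert; without some unique constraint, `insert ignore` inserts every row.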
/weixin/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import MySQLdb
from weixin.conn import Conn


class WeixinPipeline(object):
    def __init__(self):
        self.conn = Conn().getConnection()
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # INSERT IGNORE silently skips rows that violate a unique key on CrawlPage.
        sql = "insert ignore into CrawlPage(title, source, body, content, create_time) values(%s, %s, %s, %s, %s)"
        param = (item['title'], item['source'], item['body'], item['content'], item['create_time'])
        self.cursor.execute(sql, param)
        self.conn.commit()
        return item
--------------------------------------------------------------------------------
/weixin/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for weixin project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'weixin'

SPIDER_MODULES = ['weixin.spiders']
NEWSPIDER_MODULE = 'weixin.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13E238 MicroMessenger/6.3.16 NetType/WIFI Language/zh_CN'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 1

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 5
DOWNLOAD_TIMEOUT = 5
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 1
CONCURRENT_REQUESTS_PER_IP = 1

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# Note: the Cookie and Referer values below were captured from a real browser
# session on weixin.sogou.com and will expire; refresh them from your own
# browser before running the crawler.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'no-cache',
    'Cookie': 'CXID=21DFA4BB4F4AC8AC1D7B654C27A86E6E; SUID=DC3F6E244B6C860A568F3438000BBA02; SUV=00F22484246E3FDC5698E3B23EF02547; GOTO=; ld=Tkllllllll2ga@pYqh@euOtsDBPga@pzJpc@Tlllll9llllxjK@@@@@@@@@@@@@@; cd=1462759834&0e4f734bc44c6b94eb0c55d652856a58; rd=Tkllllllll2ga@pYqh@euOtsDBPga@pzJpc@Tlllll9llllxjK@@@@@@@@@@@@@@; ABTEST=2|1463014549|v1; weixinIndexVisited=1; SNUID=DC3E6F24000432CA00010C9A01146570; sct=10; JSESSIONID=aaas3h6v8QyHibVAGsnrv; PHPSESSID=f5f6fj5qermt5kmjkmvd7d3u86; SUIR=DC3E6F24000432CA00010C9A01146570; ad=8MLRxZllll2Q08yclllllVtfbIUlllllJpc@Tlllllwlllll9ylll5@@@@@@@@@@; IPLOC=CN1100; LSTMV=315%2C189; LCLKINT=9279',
    'Upgrade-Insecure-Requests': '1',
    'Referer': 'http://weixin.sogou.com/weixin?query=%E5%A4%A7%E6%95%B0%E6%8D%AE%E6%96%87%E6%91%98',
}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'weixin.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'weixin.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'weixin.pipelines.WeixinPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
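The crawler is started the usual Scrapy way from the project root (so the relative `./getBody.js` path in the spider resolves): `scrapy crawl gongzhonghao`. As a sketch, the same crawl can also be launched from a plain Python script; the launcher below is an assumption for illustration, not a file shipped in the repository.

# run.py -- hypothetical launcher, not part of the original repository.
# Equivalent to running `scrapy crawl gongzhonghao` from the project root.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())  # loads weixin/settings.py
    process.crawl('gongzhonghao')                      # spider name, not the class
    process.start()                                    # blocks until the crawl finishes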
/weixin/spiders/gongzhonghao.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import MySQLdb
import scrapy
import time
import subprocess
from scrapy.http import HtmlResponse
from scrapy.selector import Selector
from weixin.items import WeixinItem
from w3lib.html import remove_tags
from weixin.conn import Conn


class GongzhonghaoSpider(scrapy.Spider):
    name = "gongzhonghao"
    allowed_domains = ["qq.com"]
    # Each start URL is a Sogou Weixin search for one official account name.
    start_urls = [
        "http://weixin.sogou.com/weixin?query=算法与数学之美",
        "http://weixin.sogou.com/weixin?query=大数据文摘",
        "http://weixin.sogou.com/weixin?query=大数据技术",
        "http://weixin.sogou.com/weixin?query=数据分析精选",
        "http://weixin.sogou.com/weixin?query=算法与数学之美",
        "http://weixin.sogou.com/weixin?query=数盟",
        "http://weixin.sogou.com/weixin?query=MachineLearning_",
        "http://weixin.sogou.com/weixin?query=犇犇机器学习",
        "http://weixin.sogou.com/weixin?query=我爱机器学习",
        "http://weixin.sogou.com/weixin?query=机器学习算法与Python学习",
        "http://weixin.sogou.com/weixin?query=机器学习与大数据",
        "http://weixin.sogou.com/weixin?query=PLY机器学习俱乐部",
        "http://weixin.sogou.com/weixin?query=机器学习与人工智能",
        "http://weixin.sogou.com/weixin?query=菜鸟的机器学习",
        "http://weixin.sogou.com/weixin?query=培乐园机器学习社区",
        "http://weixin.sogou.com/weixin?query=我为机器学习狂",
        "http://weixin.sogou.com/weixin?query=深度学习",
        "http://weixin.sogou.com/weixin?query=深度学习世界",
        "http://weixin.sogou.com/weixin?query=数据挖掘菜鸟",
        "http://weixin.sogou.com/weixin?query=数据挖掘DW",
        "http://weixin.sogou.com/weixin?query=数据挖掘",
        "http://weixin.sogou.com/weixin?query=大数据挖掘",
        "http://weixin.sogou.com/weixin?query=数据分析精选",
        "http://weixin.sogou.com/weixin?query=人工智能头条",
        "http://weixin.sogou.com/weixin?query=程序员的自留地",
        "http://weixin.sogou.com/weixin?query=互联网创业大佬",
        "http://weixin.sogou.com/weixin?query=互联网创业刊",
        "http://weixin.sogou.com/weixin?query=互联网创业交流",
        "http://weixin.sogou.com/weixin?query=互联网创业指南",
        "http://weixin.sogou.com/weixin?query=互联网创业空间",
        "http://weixin.sogou.com/weixin?query=互联网创业思维",
        "http://weixin.sogou.com/weixin?query=全栈笔记",
        "http://weixin.sogou.com/weixin?query=全栈人生",
        "http://weixin.sogou.com/weixin?query=全栈程序猿",
    ]

    def __init__(self):
        self.conn = Conn().getConnection()
        self.cursor = self.conn.cursor()

    def parse(self, response):
        # The first search result links to the account's article-history page,
        # which is rendered with JavaScript, so fetch it through PhantomJS.
        href = response.selector.xpath('//div[@id="sogou_vr_11002301_box_0"]/@href').extract()[0]
        cmd = "~/bin/phantomjs ./getBody.js '%s'" % href
        time.sleep(1)
        stdout, stderr = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
        print stderr
        response = HtmlResponse(url=href, body=stdout)

        for selector in Selector(response=response).xpath('//*[@id="history"]/div/div/div/div'):
            href = selector.xpath('h4/@hrefs').extract()[0].strip()
            title = ""
            for elem in selector.xpath('h4/text()').extract():
                if len(elem.strip()) > 0:
                    title = elem.strip()
            abstract = selector.xpath('//*[contains(@class, "weui_media_desc")]/text()').extract()[0].strip()
            pubtime = selector.xpath('//*[contains(@class, "weui_media_extra_info")]/text()').extract()[0].strip()
            full_url = response.urljoin(href)
            # Only follow articles whose title is not already stored in CrawlPage.
            n = 0
            if len(title) != 0:
                n = self.cursor.execute("select * from CrawlPage where title=%s", (title,))
            if len(title) == 0 or n == 0:
                yield scrapy.Request(full_url, callback=self.parse_profile)

    def parse_profile(self, response):
        # Extract the article metadata and both the raw HTML and the plain text.
        title = response.xpath('//title/text()').extract()[0].strip()
        create_time = response.xpath('//em[@id="post-date"]/text()').extract()[0].strip()
        source = response.xpath('//a[@id="post-user"]/text()').extract()[0].strip()
        body = response.body.strip()
        tag_content = response.xpath('//div[@id="js_content"]').extract()[0].strip()
        content = remove_tags(tag_content).strip()
        item = WeixinItem()
        item['title'] = title
        item['create_time'] = create_time
        item['source'] = source
        item['body'] = body
        item['content'] = content
        return item
--------------------------------------------------------------------------------