├── weixin
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── gongzhonghao.py
│   ├── conn.py
│   ├── getBody.js
│   ├── items.py
│   ├── pipelines.py
│   └── settings.py
├── .gitignore
├── README.md
└── scrapy.cfg

/weixin/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
--------------------------------------------------------------------------------
/weixin/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# weixin-crawler

Batch crawler for WeChat official account (公众号) articles.

## Tutorials

* http://www.shareditor.com/blogshow/?blogId=43
* http://www.shareditor.com/blogshow/?blogId=44
* http://www.shareditor.com/blogshow/?blogId=45
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = weixin.settings

[deploy]
#url = http://localhost:6800/
project = weixin
--------------------------------------------------------------------------------
/weixin/conn.py:
--------------------------------------------------------------------------------
# coding:utf-8

import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import MySQLdb


class Conn:
    """Central place for the MySQL connection used by the spider and the pipeline."""

    def getConnection(self):
        # Credentials are hard-coded; adjust them to match your local MySQL setup.
        conn = MySQLdb.connect(host="127.0.0.1", user="lichuang", passwd="qwerty",
                               db="sharenote2.0", charset="utf8")
        return conn
--------------------------------------------------------------------------------
/weixin/getBody.js:
--------------------------------------------------------------------------------
// PhantomJS helper: load the URL passed as the first command-line argument,
// grab document.body.innerHTML once the page has finished loading, and print
// it to stdout shortly afterwards so the spider can parse the rendered page.
var page = require('webpage').create();
var system = require('system');
page.open(system.args[1], function(status) {
    var sc = page.evaluate(function() {
        return document.body.innerHTML;
    });
    window.setTimeout(function() {
        console.log(sc);
        phantom.exit();
    }, 100);
});
--------------------------------------------------------------------------------
/weixin/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class WeixinItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    create_time = scrapy.Field()
    source = scrapy.Field()
    body = scrapy.Field()
    content = scrapy.Field()
--------------------------------------------------------------------------------
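The pipeline that follows writes every item into a MySQL table named CrawlPage, but the repository does not ship the table definition. Below is a minimal sketch of a schema that fits the INSERT in pipelines.py and the title-based duplicate check in gongzhonghao.py; the column types, sizes, the id column, and the unique key on title are assumptions, not part of the original project.

# create_table.py -- hypothetical helper, not part of the original repository.
# Creates a CrawlPage table compatible with the INSERT in pipelines.py and the
# select-by-title duplicate check in gongzhonghao.py. Column types are guesses.
from weixin.conn import Conn

CREATE_SQL = """
CREATE TABLE IF NOT EXISTS CrawlPage (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    source VARCHAR(255),
    body MEDIUMTEXT,
    content MEDIUMTEXT,
    create_time VARCHAR(64),
    UNIQUE KEY uk_title (title)
) DEFAULT CHARSET=utf8
"""

if __name__ == '__main__':
    conn = Conn().getConnection()
    cursor = conn.cursor()
    cursor.execute(CREATE_SQL)
    conn.commit()
    conn.close()

The unique key on title is what makes the pipeline's `insert ignore` behave as a de-duplicating insert; without some unique constraint, `insert ignore` inserts every row.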
/weixin/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import MySQLdb
from weixin.conn import Conn


class WeixinPipeline(object):
    def __init__(self):
        self.conn = Conn().getConnection()
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # INSERT IGNORE silently skips rows that violate a unique key on CrawlPage.
        sql = "insert ignore into CrawlPage(title, source, body, content, create_time) values(%s, %s, %s, %s, %s)"
        param = (item['title'], item['source'], item['body'], item['content'], item['create_time'])
        self.cursor.execute(sql, param)
        self.conn.commit()
        return item
--------------------------------------------------------------------------------
/weixin/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for weixin project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'weixin'

SPIDER_MODULES = ['weixin.spiders']
NEWSPIDER_MODULE = 'weixin.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13E238 MicroMessenger/6.3.16 NetType/WIFI Language/zh_CN'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 1

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 5
DOWNLOAD_TIMEOUT = 5
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 1
CONCURRENT_REQUESTS_PER_IP = 1

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# Note: the Cookie and Referer values below were captured from a real browser
# session on weixin.sogou.com and will expire; refresh them from your own
# browser before running the crawler.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'no-cache',
    'Cookie': 'CXID=21DFA4BB4F4AC8AC1D7B654C27A86E6E; SUID=DC3F6E244B6C860A568F3438000BBA02; SUV=00F22484246E3FDC5698E3B23EF02547; GOTO=; ld=Tkllllllll2ga@pYqh@euOtsDBPga@pzJpc@Tlllll9llllxjK@@@@@@@@@@@@@@; cd=1462759834&0e4f734bc44c6b94eb0c55d652856a58; rd=Tkllllllll2ga@pYqh@euOtsDBPga@pzJpc@Tlllll9llllxjK@@@@@@@@@@@@@@; ABTEST=2|1463014549|v1; weixinIndexVisited=1; SNUID=DC3E6F24000432CA00010C9A01146570; sct=10; JSESSIONID=aaas3h6v8QyHibVAGsnrv; PHPSESSID=f5f6fj5qermt5kmjkmvd7d3u86; SUIR=DC3E6F24000432CA00010C9A01146570; ad=8MLRxZllll2Q08yclllllVtfbIUlllllJpc@Tlllllwlllll9ylll5@@@@@@@@@@; IPLOC=CN1100; LSTMV=315%2C189; LCLKINT=9279',
    'Upgrade-Insecure-Requests': '1',
    'Referer': 'http://weixin.sogou.com/weixin?query=%E5%A4%A7%E6%95%B0%E6%8D%AE%E6%96%87%E6%91%98',
}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'weixin.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'weixin.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'weixin.pipelines.WeixinPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
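The crawler is started the usual Scrapy way from the project root (so the relative `./getBody.js` path in the spider resolves): `scrapy crawl gongzhonghao`. As a sketch, the same crawl can also be launched from a plain Python script; the launcher below is an assumption for illustration, not a file shipped in the repository.

# run.py -- hypothetical launcher, not part of the original repository.
# Equivalent to running `scrapy crawl gongzhonghao` from the project root.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())  # loads weixin/settings.py
    process.crawl('gongzhonghao')                      # spider name, not the class
    process.start()                                    # blocks until the crawl finishes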
/weixin/spiders/gongzhonghao.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import MySQLdb
import scrapy
import time
import subprocess
from scrapy.http import HtmlResponse
from scrapy.selector import Selector
from weixin.items import WeixinItem
from w3lib.html import remove_tags
from weixin.conn import Conn


class GongzhonghaoSpider(scrapy.Spider):
    name = "gongzhonghao"
    allowed_domains = ["qq.com"]
    # Each start URL is a Sogou Weixin search for one official account name.
    start_urls = [
        "http://weixin.sogou.com/weixin?query=算法与数学之美",
        "http://weixin.sogou.com/weixin?query=大数据文摘",
        "http://weixin.sogou.com/weixin?query=大数据技术",
        "http://weixin.sogou.com/weixin?query=数据分析精选",
        "http://weixin.sogou.com/weixin?query=算法与数学之美",
        "http://weixin.sogou.com/weixin?query=数盟",
        "http://weixin.sogou.com/weixin?query=MachineLearning_",
        "http://weixin.sogou.com/weixin?query=犇犇机器学习",
        "http://weixin.sogou.com/weixin?query=我爱机器学习",
        "http://weixin.sogou.com/weixin?query=机器学习算法与Python学习",
        "http://weixin.sogou.com/weixin?query=机器学习与大数据",
        "http://weixin.sogou.com/weixin?query=PLY机器学习俱乐部",
        "http://weixin.sogou.com/weixin?query=机器学习与人工智能",
        "http://weixin.sogou.com/weixin?query=菜鸟的机器学习",
        "http://weixin.sogou.com/weixin?query=培乐园机器学习社区",
        "http://weixin.sogou.com/weixin?query=我为机器学习狂",
        "http://weixin.sogou.com/weixin?query=深度学习",
        "http://weixin.sogou.com/weixin?query=深度学习世界",
        "http://weixin.sogou.com/weixin?query=数据挖掘菜鸟",
        "http://weixin.sogou.com/weixin?query=数据挖掘DW",
        "http://weixin.sogou.com/weixin?query=数据挖掘",
        "http://weixin.sogou.com/weixin?query=大数据挖掘",
        "http://weixin.sogou.com/weixin?query=数据分析精选",
        "http://weixin.sogou.com/weixin?query=人工智能头条",
        "http://weixin.sogou.com/weixin?query=程序员的自留地",
        "http://weixin.sogou.com/weixin?query=互联网创业大佬",
        "http://weixin.sogou.com/weixin?query=互联网创业刊",
        "http://weixin.sogou.com/weixin?query=互联网创业交流",
        "http://weixin.sogou.com/weixin?query=互联网创业指南",
        "http://weixin.sogou.com/weixin?query=互联网创业空间",
        "http://weixin.sogou.com/weixin?query=互联网创业思维",
        "http://weixin.sogou.com/weixin?query=全栈笔记",
        "http://weixin.sogou.com/weixin?query=全栈人生",
        "http://weixin.sogou.com/weixin?query=全栈程序猿",
    ]

    def __init__(self):
        self.conn = Conn().getConnection()
        self.cursor = self.conn.cursor()

    def parse(self, response):
        # The first search result links to the account's article-history page,
        # which is rendered with JavaScript, so fetch it through PhantomJS.
        href = response.selector.xpath('//div[@id="sogou_vr_11002301_box_0"]/@href').extract()[0]
        cmd = "~/bin/phantomjs ./getBody.js '%s'" % href
        time.sleep(1)
        stdout, stderr = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
        print stderr
        response = HtmlResponse(url=href, body=stdout)

        for selector in Selector(response=response).xpath('//*[@id="history"]/div/div/div/div'):
            href = selector.xpath('h4/@hrefs').extract()[0].strip()
            title = ""
            for elem in selector.xpath('h4/text()').extract():
                if len(elem.strip()) > 0:
                    title = elem.strip()
            abstract = selector.xpath('//*[contains(@class, "weui_media_desc")]/text()').extract()[0].strip()
            pubtime = selector.xpath('//*[contains(@class, "weui_media_extra_info")]/text()').extract()[0].strip()
            full_url = response.urljoin(href)
            # Only follow articles whose title is not already stored in CrawlPage.
            n = 0
            if len(title) != 0:
                n = self.cursor.execute("select * from CrawlPage where title=%s", (title,))
            if len(title) == 0 or n == 0:
                yield scrapy.Request(full_url, callback=self.parse_profile)

    def parse_profile(self, response):
        # Extract the article metadata and both the raw HTML and the plain text.
        title = response.xpath('//title/text()').extract()[0].strip()
        create_time = response.xpath('//em[@id="post-date"]/text()').extract()[0].strip()
        source = response.xpath('//a[@id="post-user"]/text()').extract()[0].strip()
        body = response.body.strip()
        tag_content = response.xpath('//div[@id="js_content"]').extract()[0].strip()
        content = remove_tags(tag_content).strip()
        item = WeixinItem()
        item['title'] = title
        item['create_time'] = create_time
        item['source'] = source
        item['body'] = body
        item['content'] = content
        return item
--------------------------------------------------------------------------------