├── wechat_spider
│   ├── wechat_spider
│   │   ├── __pycache__
│   │   │   ├── items.cpython-35.pyc
│   │   │   ├── __init__.cpython-35.pyc
│   │   │   ├── pipelines.cpython-35.pyc
│   │   │   ├── settings.cpython-35.pyc
│   │   │   └── middlewares.cpython-35.pyc
│   │   ├── spiders
│   │   │   ├── __pycache__
│   │   │   │   ├── wechat.cpython-35.pyc
│   │   │   │   └── __init__.cpython-35.pyc
│   │   │   ├── __init__.py
│   │   │   └── wechat.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── proxyCrawl.py
│   │   ├── settings.py
│   │   └── middlewares.py
│   └── scrapy.cfg
└── README.md

/wechat_spider/wechat_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/wechat_spider/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = wechat_spider.settings

[deploy]
#url = http://localhost:6800/
project = wechat_spider

--------------------------------------------------------------------------------
/wechat_spider/wechat_spider/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class WechatSpiderItem(scrapy.Item):
    # article title
    title = scrapy.Field()
    # publication time
    publishTime = scrapy.Field()
    # article body
    article = scrapy.Field()
    # official account name
    publicName = scrapy.Field()

--------------------------------------------------------------------------------
/wechat_spider/wechat_spider/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql

class WechatSpiderPipeline(object):
    def __init__(self):
        # Connect to the database (the same credentials are exposed as MYSQL_* in settings.py)
        self.conn = pymysql.connect(host='localhost', port=3306, user='root',
                                    passwd='bamajie521mysql', db='alicelmx', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Fields to insert into the table
        title = item['title']
        publishTime = item['publishTime']
        article = item['article']
        publicName = item['publicName']

        # Check whether a row with the same title and account name already exists
        self.cursor.execute("select title from wechatArticle;")
        titleList = self.cursor.fetchall()
        titleStr = ''.join(map(str, titleList))

        self.cursor.execute("select publicName from wechatArticle;")
        nameList = self.cursor.fetchall()
        nameStr = ''.join(map(str, nameList))

        if titleStr.find(title) == -1 and nameStr.find(publicName) == -1:
            # Parameterized INSERT statement
            sql = """INSERT INTO wechatArticle
                     (title, publishTime, article, publicName)
                     VALUES
                     (%s, %s, %s, %s)
                  """
            self.cursor.execute(sql, (title, publishTime, article, publicName))
            self.conn.commit()
        else:
            print("This article already exists in the database!")
        return item

    def close_spider(self, spider):
        self.conn.close()

--------------------------------------------------------------------------------
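Note: the pipeline above expects a `wechatArticle` table to exist in the `alicelmx` database, but the repository does not ship a schema. Below is a minimal sketch of a matching table; only the column names, database name and `utf8mb4` charset come from `pipelines.py`/`settings.py`, while the column types and the `id` key are assumptions.

```
import pymysql

# One-off helper that creates the table the pipeline writes to.
# Column types are assumptions; utf8mb4 is used so that emoji in article
# bodies do not trigger "Incorrect string value" errors on insert.
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='bamajie521mysql', db='alicelmx', charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS wechatArticle (
            id          INT AUTO_INCREMENT PRIMARY KEY,
            title       VARCHAR(255),
            publishTime VARCHAR(64),
            article     MEDIUMTEXT,
            publicName  VARCHAR(255)
        ) DEFAULT CHARSET=utf8mb4;
    """)
conn.commit()
conn.close()
```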
/wechat_spider/wechat_spider/proxyCrawl.py:
--------------------------------------------------------------------------------
import urllib.request
from bs4 import BeautifulSoup
import time
import random


# ------------------------------- shared helpers -------------------------------
class CommanCalss:
    def __init__(self):
        self.header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'}
        self.testurl = "http://www.baidu.com"

    def getresponse(self, url):
        req = urllib.request.Request(url, headers=self.header)
        resp = urllib.request.urlopen(req, timeout=5)
        content = resp.read()
        return content

    def _is_alive(self, proxy):
        # Try the test URL through the proxy a few times; any HTTP 200 counts as alive
        try:
            for i in range(3):
                proxy_support = urllib.request.ProxyHandler({"http": proxy})
                opener = urllib.request.build_opener(proxy_support)
                urllib.request.install_opener(opener)
                req = urllib.request.Request(self.testurl, headers=self.header)
                resp = urllib.request.urlopen(req, timeout=5)
                if resp.getcode() == 200:
                    return True
            return False
        except Exception:
            return False


# ------------------------------- proxy pool -------------------------------
class ProxyPool:
    def __init__(self, proxy_finder):
        self.pool = []
        self.proxy_finder = proxy_finder
        self.cominstan = CommanCalss()

    def get_proxies(self):
        # Keep only the proxies that pass the liveness check (filtering into a
        # new list avoids skipping elements while removing from the one being iterated)
        self.pool = [p for p in self.proxy_finder.find() if self.cominstan._is_alive(p)]

    def get_one_proxy(self):
        return random.choice(self.pool)

    def writeToTxt(self, file_path):
        try:
            fp = open(file_path, "w+")
            for item in self.pool:
                fp.write('"' + 'http://' + str(item) + '"' + '\n')
            fp.close()
        except IOError:
            print("fail to open file")


# ------------------------------- proxy finders -------------------------------
# Base class
class IProxyFinder:
    def __init__(self):
        self.pool = []

    def find(self):
        return

# Crawl proxies from the Xici proxy site
class XiciProxyFinder(IProxyFinder):
    def __init__(self, url):
        super(XiciProxyFinder, self).__init__()
        self.url = url
        self.cominstan = CommanCalss()

    def find(self):
        for i in range(1, 10):
            content = self.cominstan.getresponse(self.url + str(i))
            soup = BeautifulSoup(content, 'lxml')
            ips = soup.findAll('tr')
            for x in range(2, len(ips)):
                ip = ips[x]
                tds = ip.findAll("td")
                if tds == []:
                    continue
                ip_temp = tds[1].contents[0] + ":" + tds[2].contents[0]
                self.pool.append(ip_temp)
            time.sleep(1)
        return self.pool


# ------------------------------- test -------------------------------
if __name__ == '__main__':
    finder = XiciProxyFinder("http://www.xicidaili.com/wn/")
    ppool_instance = ProxyPool(finder)
    ppool_instance.get_proxies()
    ppool_instance.writeToTxt("proxy.txt")

--------------------------------------------------------------------------------
/wechat_spider/wechat_spider/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import random
# Scrapy settings for wechat_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'wechat_spider'

SPIDER_MODULES = ['wechat_spider.spiders']
NEWSPIDER_MODULE = 'wechat_spider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Note: randint() runs once when the settings are loaded, so each run uses one
# fixed delay of 1-3 seconds.
DOWNLOAD_DELAY = random.randint(1, 3)
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'wechat_spider.middlewares.WechatSpiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'wechat_spider.middlewares.RandomUserAgent': 10,
    'wechat_spider.middlewares.ProxyMiddleWare': 100,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'wechat_spider.pipelines.WechatSpiderPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MySQL database configuration (used by the item pipeline)
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'alicelmx'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'bamajie521mysql'
# end of MySQL database configuration

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# wechat_public_spider
Crawl WeChat official account (公众号) articles through the Sogou search engine.

Updated by 小明酱 on New Year's Day 2018. The code is still rough; if you run into crawler problems, feel free to discuss — the comment section is always open!

Two weeks of my internship have passed and the workload is still light. My supervisor is very nice — a military-academy graduate — and the task he assigned is closer to my research direction than I expected: a public-opinion monitoring system for WeChat official account articles. The overall design of the system is shown below:

![public-opinion monitoring system](http://img.blog.csdn.net/20171222085524606?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvYWxpY2VsbXg=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)

The first week's task is crawling official account articles, with the following features:

- Crawl the title, body, publication time and account name of articles, searching either by official account ID or by keyword
- Store the data in the database with the correct encoding
- Insert only newly found articles into the database
- In keyword mode, sort articles by time and crawl across result pages

All of the above is implemented. I genuinely feel that learning inside a project is the most efficient way, though I also tend to settle for a shallow understanding; next week I hope to grasp the new material more deeply and write a proper summary rather than stopping at plug-and-play programming.

Below is the overall approach ([GitHub code here](https://github.com/alicelmx/wechat_public_spider)):
### ***Main approach***
- Send requests to the [Sogou search engine, WeChat's search partner](http://weixin.sogou.com/), and crawl indirectly. Two search modes are available, as shown below: search articles and search official accounts.

Entering an official account ID gives the spider's start URL:
```
http://weixin.sogou.com/weixin?type=1&s_from=input&query=+公众号ID+&ie=utf8&_sug_=n&_sug_type_=
```
![searching for a specific official account](http://img.blog.csdn.net/20171222092034734?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvYWxpY2VsbXg=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)

- Environment: Python 3.5 | Scrapy | macOS | MySQL

### ***Designing crawl rules from the page structure***
I hesitated at this stage for a long time: many demos implement flashy anti-anti-crawling tricks, which made me nervous. Many thanks to 漫步大佬 for walking me through it — his disdain made me improve.

- Crawling an account's latest ten articles by its ID: a simple three-level crawl.
**Use the account's English ID in the "search official accounts" mode. The English ID is unique to an account, while the Chinese name may be duplicated; the name must also be entered exactly right, otherwise the search returns a lot of noise. This keeps the filtering work small: we only need the single result matching that unique English ID.**

1. Level one: find the target account and get the link to its profile page

![level one: search result](http://img.blog.csdn.net/20171222092516780?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvYWxpY2VsbXg=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)

2. Level two: go to the profile page and collect the link of each article

![level two: article list on the profile page](http://img.blog.csdn.net/20171222092612772?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvYWxpY2VsbXg=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)

3. Level three: open each article page and scrape the three fields in the green boxes plus the article body

![level three: article page](http://img.blog.csdn.net/20171222093038688?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvYWxpY2VsbXg=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)

That is the main idea. I will not go into the Chrome DevTools inspection details, since they are routine; a condensed illustration of the level-two parsing step follows below.
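The profile page does not expose the article list as plain HTML links; it embeds it in a JavaScript variable, which `parseArticleList()` in `spiders/wechat.py` extracts with a regex and parses as JSON. The standalone sketch below replays that step on a hypothetical, heavily trimmed sample of the embedded structure:

```
import json
import re

# Hypothetical, trimmed stand-in for the profile page; the real page embeds the
# ten latest articles in the same "var msgList = {...};" variable.
sample_html = '<script>var msgList = {"list": [{"app_msg_ext_info": {"title": "Some article", "content_url": "/s?__biz=XXXX&amp;mid=1&amp;sn=abc"}}]};</script>'

# Same regex and URL clean-up as parseArticleList() in spiders/wechat.py
match = re.search(r'var msgList = (\{.*?\});', sample_html)
for data in json.loads(match.group(1))['list']:
    info = data['app_msg_ext_info']
    url = 'https://mp.weixin.qq.com' + info['content_url'].replace('amp;', '')
    print(info['title'], url)
```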
- Crawling related articles by keyword — a two-level crawl along the same lines — but here a serious problem shows up:
when I filter and sort by time to get only fresh articles, the filter produces a URL

![time filter in the search UI](http://img.blog.csdn.net/20180101222644849?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvYWxpY2VsbXg=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)

URL:
```
http://weixin.sogou.com/weixin?type=2&ie=utf8&query=%E6%98%A5%E8%8A%82&tsn=1&ft=&et=&interation=&wxid=&usip=
```
Yet when I paste this URL back into the browser, it redirects to the WeChat search home page. So what now? Let's look at the developer tools first:

![request parameters in the developer tools](http://img.blog.csdn.net/20180101222905740?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvYWxpY2VsbXg=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)

These parameters match the URL exactly. Pay particular attention to tsn: it restricts the results to articles from the last day. The request should therefore be built like this:

```
return [scrapy.FormRequest(url='http://weixin.sogou.com/weixin',
                           formdata={'type': '2',
                                     'ie': 'utf8',
                                     'query': key,
                                     'tsn': '1',
                                     'ft': '',
                                     'et': '',
                                     'interation': '',
                                     'sst0': str(int(time.time()*1000)),
                                     'page': str(self.page),
                                     'wxid': '',
                                     'usip': ''},
                           method='get')]
```
That solves the problem.
### ***Storage details***
The amount of data to store is fairly large — many samples are needed to train a model later — and everything goes into the database. One detail deserves attention: encoding. Official account articles are full of emoji characters, which leads to errors such as:
```
pymysql.err.InternalError: (1366, "Incorrect string value: '\\xF0\\x9F\\x93\\xBD \\xC2...' for column 'article' at row 1")
```
The fix (switching the connection and table to utf8mb4) is described here:
http://blog.csdn.net/alicelmx/article/details/78890311
http://blog.csdn.net/alicelmx/article/details/78890914

### ***Fighting anti-crawling measures***
The main obstacle is the CAPTCHA, but it can be avoided as long as the crawl is not too fast, so two strategies are used:

- Add a random User-Agent and a random proxy IP in the downloader middleware.
Several batches of crawled proxies turned out to be unusable, which wasted a lot of time; how to add proxy IPs is described here:
http://blog.csdn.net/alicelmx/article/details/78947884

- Set the delay parameter in the settings to a random number:
```
import random
DOWNLOAD_DELAY = random.randint(1, 3)
```
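Note that `random.randint(1, 3)` runs once, when the settings module is loaded, so each run uses a single fixed delay rather than a new random delay per request. If per-request jitter is the goal, Scrapy's built-in `RANDOMIZE_DOWNLOAD_DELAY` setting (enabled by default) already multiplies `DOWNLOAD_DELAY` by a random factor between 0.5 and 1.5, so a sketch like this achieves the same effect:
```
DOWNLOAD_DELAY = 2
# With this enabled (the default), Scrapy waits between 1 and 3 seconds between requests
RANDOMIZE_DOWNLOAD_DELAY = True
```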
--------------------------------------------------------------------------------
/wechat_spider/wechat_spider/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
import random

class WechatSpiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

class RandomUserAgent:
    def __init__(self, agents):
        # Use the USER_AGENTS list from the settings if one is configured,
        # otherwise fall back to this built-in list.
        self.agents = agents or [
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
        ]

    @classmethod
    def from_crawler(cls, crawler):
        # Read the (optional) USER_AGENTS list from the settings
        return cls(crawler.settings.getlist('USER_AGENTS'))

    def process_request(self, request, spider):
        # Set a random User-Agent header on the request
        request.headers.setdefault('User-Agent', random.choice(self.agents))


# Proxy middleware
class ProxyMiddleWare(object):
    proxy_list = [
        "http://58.87.89.234:3128",
        "http://139.201.202.140:53281",
        "http://27.37.123.30:9000",
        "http://218.67.82.146:36709",
        "http://222.222.169.60:53281",
        "http://120.33.247.233:46884",
        "http://114.215.18.7:3128",
        "http://112.74.94.142:3128",
        "http://122.72.18.34:80",
        "http://36.33.25.123:808",
        "http://123.138.89.133:9999",
        "http://111.231.192.61:8080",
        "http://59.41.202.228:53281",
        "http://222.241.14.187:8888",
        "http://61.155.164.106:3128",
        "http://27.40.156.43:61234",
        "http://14.29.84.50:8080",
        "http://116.25.100.62:9797",
        "http://58.21.183.144:80",
        "http://14.221.166.205:9000",
        "http://115.231.50.10:53281",
        "http://120.34.205.40:808",
        "http://123.139.56.238:9999",
        "http://113.116.170.232:9000",
        "http://116.17.236.36:808",
        "http://114.232.163.73:34837",
        "http://171.35.103.37:808",
        "http://27.46.51.232:9797",
        "http://223.247.255.207:24714",
        "http://223.241.117.179:8010",
        "http://222.186.12.102:57624"]

    def process_request(self, request, spider):
        # Attach a randomly chosen proxy to every request
        request.meta['proxy'] = random.choice(self.proxy_list)

--------------------------------------------------------------------------------
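The proxy list above is hardcoded, while `proxyCrawl.py` writes its verified proxies to `proxy.txt` (one quoted `http://ip:port` per line). A possible way to connect the two — a sketch, not part of the original project — is a middleware variant that reads that file when it is instantiated; it would replace `ProxyMiddleWare` in `DOWNLOADER_MIDDLEWARES` if used:

```
import random

# Hypothetical variant of ProxyMiddleWare that loads proxies from the proxy.txt
# file produced by proxyCrawl.py instead of using a hardcoded list.
class FileProxyMiddleWare(object):
    def __init__(self, proxy_file='proxy.txt'):
        with open(proxy_file) as fp:
            # proxyCrawl.ProxyPool.writeToTxt writes lines like "http://1.2.3.4:8080"
            # (quotes included), so strip whitespace and quotes here.
            self.proxy_list = [line.strip().strip('"') for line in fp if line.strip()]

    def process_request(self, request, spider):
        request.meta['proxy'] = random.choice(self.proxy_list)
```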
/wechat_spider/wechat_spider/spiders/wechat.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from wechat_spider.items import WechatSpiderItem
import json
import re
from bs4 import BeautifulSoup
import time

class WechatSpider(scrapy.Spider):
    name = 'wechat'
    # allowed_domains = ['weixin.sogou.com/']
    headers = {'Host': 'weixin.sogou.com',
               'Referer': 'http://weixin.sogou.com/weixin?type=2&s_from=input&query=%E6%B5%85%E5%B1%B1%E5%B0%8F%E7%AD%91&ie=utf8&_sug_=y&_sug_type_=&w=01019900&sut=5109&sst0=1513697178371&lkt=0%2C0%2C0',
               'Upgrade-Insecure-Requests': '1',
               'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

    # Note: these prompts run at class-definition time, i.e. as soon as the spider module is imported.
    print('''
    **************************
    Welcome to the WeChat official account spider
    Created on 2017-12-21
    @author: 小明酱lmx
    **************************
    ''')
    print("1. Search by official account ID; 2. Search by keyword")
    choice = input("Enter the query mode: ")
    page = 1

    def start_requests(self):
        if self.choice == '1':
            id = input("Enter the official account ID to query: ")
            start_urls = 'http://weixin.sogou.com/weixin?type=1&s_from=input&query=' + id + '&ie=utf8&_sug_=n&_sug_type_='
            return [scrapy.Request(start_urls, callback=self.parse1)]

        elif self.choice == '2':
            key = input("Enter the keyword to query: ")
            maxNo = int(input('Enter the maximum page number to crawl: '))
            return [scrapy.FormRequest(url='http://weixin.sogou.com/weixin',
                                       formdata={'type': '2',
                                                 'ie': 'utf8',
                                                 'query': key,
                                                 'tsn': '1',
                                                 'ft': '',
                                                 'et': '',
                                                 'interation': '',
                                                 'sst0': str(int(time.time()*1000)),
                                                 'page': str(self.page),
                                                 'wxid': '',
                                                 'usip': ''},
                                       method='get',
                                       meta={'key': key, 'maxNo': maxNo},
                                       headers=self.headers,
                                       callback=self.parse2)]
        else:
            print('Invalid input, exiting.')
            return

    def parse1(self, response):
        # Get the link to the official account's profile page
        publicUrl = response.xpath("//p[@class='tit']/a[@target='_blank']/@href").extract()[0]
        print("*********" + publicUrl + "************")
        yield scrapy.Request(publicUrl, cookies={'viewed': '"1083428"', '__utmv': '30149280.3975'}, callback=self.parseArticleList)

    def parseArticleList(self, response):
        # The profile page embeds the article list as a JS variable: var msgList = {...};
        patt = re.compile(r'var msgList = (\{.*?\});')
        result = patt.search(response.text)
        url_list = json.loads(result.group(1))['list']
        for data in url_list:
            title = data['app_msg_ext_info']['title']
            article_url = data['app_msg_ext_info']['content_url']
            url = 'https://mp.weixin.qq.com' + article_url.replace(r'amp;', '')
            yield scrapy.Request(url, meta={'title': title}, callback=self.parseArticle)

    def parseArticle(self, response):
        item = WechatSpiderItem()
        item['title'] = response.meta['title']
        soup = BeautifulSoup(response.text, 'lxml')
        item['publishTime'] = soup.find('em', attrs={'class': 'rich_media_meta rich_media_meta_text'}).get_text()
        item['article'] = soup.find('div', attrs={'class': 'rich_media_content '}).get_text()
        item['publicName'] = response.xpath("//a[@class='rich_media_meta rich_media_meta_link rich_media_meta_nickname']/text()").extract()[0]
        yield item

    # Collect the link of every article on a keyword-search result page and crawl each one
    def parse2(self, response):
        key = response.meta['key']
        maxNo = response.meta['maxNo']
        soup = BeautifulSoup(response.text, 'lxml')
        node_soup = soup.find('ul', attrs={'class': 'news-list'})

        for node in node_soup.findAll('li'):
            url = node.select('div h3 a')[0]['href']
            yield scrapy.Request(url, callback=self.parseArticleBody)

        # Crawl across result pages up to the requested maximum
        while self.page < maxNo:
            self.page += 1
            yield scrapy.FormRequest(url='http://weixin.sogou.com/weixin',
                                     formdata={'type': '2', 'ie': 'utf8', 'query': key, 'tsn': '1', 'ft': '', 'et': '', 'interation': '', 'sst0': str(int(time.time() * 1000)), 'page': str(self.page), 'wxid': '', 'usip': ''},
                                     method='get',
                                     meta={'key': key, 'maxNo': maxNo},
                                     headers=self.headers,
                                     callback=self.parse2)

    # Scrape the required fields from each article page
    def parseArticleBody(self, response):
        item = WechatSpiderItem()
        item['title'] = response.xpath("//div[@id='img-content']/h2[@class='rich_media_title']/text()").extract()[0].strip().replace('\r', '').replace('\n', '').replace('\t', '')
        soup = BeautifulSoup(response.text, 'lxml')
        item['publishTime'] = soup.find('em', attrs={'class': 'rich_media_meta rich_media_meta_text'}).get_text()
        item['article'] = soup.find('div', attrs={'class': 'rich_media_content '}).get_text()
        item['publicName'] = response.xpath("//a[@class='rich_media_meta rich_media_meta_link rich_media_meta_nickname']/text()").extract()[0]
        yield item

--------------------------------------------------------------------------------
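For reference, this is a standard Scrapy project layout, so — assuming the dependencies used in the code (scrapy, pymysql, beautifulsoup4, lxml) are installed and the MySQL table from the sketch above exists — the spider would be started from the directory containing scrapy.cfg with:

```
scrapy crawl wechat
```

The spider then prompts interactively for the query mode and for either an official account ID or a keyword plus a maximum page number, as implemented in start_requests().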