├── wechat_spider
│   ├── wechat_spider
│   │   ├── __pycache__
│   │   │   ├── items.cpython-35.pyc
│   │   │   ├── __init__.cpython-35.pyc
│   │   │   ├── pipelines.cpython-35.pyc
│   │   │   ├── settings.cpython-35.pyc
│   │   │   └── middlewares.cpython-35.pyc
│   │   ├── spiders
│   │   │   ├── __pycache__
│   │   │   │   ├── wechat.cpython-35.pyc
│   │   │   │   └── __init__.cpython-35.pyc
│   │   │   ├── __init__.py
│   │   │   └── wechat.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── proxyCrawl.py
│   │   ├── settings.py
│   │   └── middlewares.py
│   └── scrapy.cfg
└── README.md

/wechat_spider/wechat_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/wechat_spider/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = wechat_spider.settings

[deploy]
#url = http://localhost:6800/
project = wechat_spider

--------------------------------------------------------------------------------
/wechat_spider/wechat_spider/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class WechatSpiderItem(scrapy.Item):
    # article title
    title = scrapy.Field()
    # publication time
    publishTime = scrapy.Field()
    # article body
    article = scrapy.Field()
    # official account name
    publicName = scrapy.Field()

--------------------------------------------------------------------------------
/wechat_spider/wechat_spider/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql

class WechatSpiderPipeline(object):
    def __init__(self):
        # Connect to the database (the same credentials are exposed as MYSQL_* in settings.py)
        self.conn = pymysql.connect(host='localhost', port=3306, user='root',
                                    passwd='bamajie521mysql', db='alicelmx', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Fields to insert into the table
        title = item['title']
        publishTime = item['publishTime']
        article = item['article']
        publicName = item['publicName']

        # Check whether a row with the same title and account name already exists
        self.cursor.execute("select title from wechatArticle;")
        titleList = self.cursor.fetchall()
        titleStr = ''.join(map(str, titleList))

        self.cursor.execute("select publicName from wechatArticle;")
        nameList = self.cursor.fetchall()
        nameStr = ''.join(map(str, nameList))

        if titleStr.find(title) == -1 and nameStr.find(publicName) == -1:
            # Parameterized INSERT statement
            sql = """INSERT INTO wechatArticle
                     (title, publishTime, article, publicName)
                     VALUES
                     (%s, %s, %s, %s)
                  """
            self.cursor.execute(sql, (title, publishTime, article, publicName))
            self.conn.commit()
        else:
            print("This article already exists in the database!")
        return item

    def close_spider(self, spider):
        self.conn.close()

--------------------------------------------------------------------------------
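Note: the pipeline above expects a `wechatArticle` table to exist in the `alicelmx` database, but the repository does not ship a schema. Below is a minimal sketch of a matching table; only the column names, database name and `utf8mb4` charset come from `pipelines.py`/`settings.py`, while the column types and the `id` key are assumptions.

```
import pymysql

# One-off helper that creates the table the pipeline writes to.
# Column types are assumptions; utf8mb4 is used so that emoji in article
# bodies do not trigger "Incorrect string value" errors on insert.
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='bamajie521mysql', db='alicelmx', charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS wechatArticle (
            id          INT AUTO_INCREMENT PRIMARY KEY,
            title       VARCHAR(255),
            publishTime VARCHAR(64),
            article     MEDIUMTEXT,
            publicName  VARCHAR(255)
        ) DEFAULT CHARSET=utf8mb4;
    """)
conn.commit()
conn.close()
```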
/wechat_spider/wechat_spider/proxyCrawl.py:
--------------------------------------------------------------------------------
import urllib.request
from bs4 import BeautifulSoup
import time
import random


# ------------------------------- shared helpers -------------------------------
class CommanCalss:
    def __init__(self):
        self.header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'}
        self.testurl = "http://www.baidu.com"

    def getresponse(self, url):
        req = urllib.request.Request(url, headers=self.header)
        resp = urllib.request.urlopen(req, timeout=5)
        content = resp.read()
        return content

    def _is_alive(self, proxy):
        # Try the test URL through the proxy a few times; any HTTP 200 counts as alive
        try:
            for i in range(3):
                proxy_support = urllib.request.ProxyHandler({"http": proxy})
                opener = urllib.request.build_opener(proxy_support)
                urllib.request.install_opener(opener)
                req = urllib.request.Request(self.testurl, headers=self.header)
                resp = urllib.request.urlopen(req, timeout=5)
                if resp.getcode() == 200:
                    return True
            return False
        except Exception:
            return False


# ------------------------------- proxy pool -------------------------------
class ProxyPool:
    def __init__(self, proxy_finder):
        self.pool = []
        self.proxy_finder = proxy_finder
        self.cominstan = CommanCalss()

    def get_proxies(self):
        # Keep only the proxies that pass the liveness check (filtering into a
        # new list avoids skipping elements while removing from the one being iterated)
        self.pool = [p for p in self.proxy_finder.find() if self.cominstan._is_alive(p)]

    def get_one_proxy(self):
        return random.choice(self.pool)

    def writeToTxt(self, file_path):
        try:
            fp = open(file_path, "w+")
            for item in self.pool:
                fp.write('"' + 'http://' + str(item) + '"' + '\n')
            fp.close()
        except IOError:
            print("fail to open file")


# ------------------------------- proxy finders -------------------------------
# Base class
class IProxyFinder:
    def __init__(self):
        self.pool = []

    def find(self):
        return

# Crawl proxies from the Xici proxy site
class XiciProxyFinder(IProxyFinder):
    def __init__(self, url):
        super(XiciProxyFinder, self).__init__()
        self.url = url
        self.cominstan = CommanCalss()

    def find(self):
        for i in range(1, 10):
            content = self.cominstan.getresponse(self.url + str(i))
            soup = BeautifulSoup(content, 'lxml')
            ips = soup.findAll('tr')
            for x in range(2, len(ips)):
                ip = ips[x]
                tds = ip.findAll("td")
                if tds == []:
                    continue
                ip_temp = tds[1].contents[0] + ":" + tds[2].contents[0]
                self.pool.append(ip_temp)
            time.sleep(1)
        return self.pool


# ------------------------------- test -------------------------------
if __name__ == '__main__':
    finder = XiciProxyFinder("http://www.xicidaili.com/wn/")
    ppool_instance = ProxyPool(finder)
    ppool_instance.get_proxies()
    ppool_instance.writeToTxt("proxy.txt")

--------------------------------------------------------------------------------
/wechat_spider/wechat_spider/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import random
# Scrapy settings for wechat_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'wechat_spider'

SPIDER_MODULES = ['wechat_spider.spiders']
NEWSPIDER_MODULE = 'wechat_spider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Note: randint() runs once when the settings are loaded, so each run uses one
# fixed delay of 1-3 seconds.
DOWNLOAD_DELAY = random.randint(1, 3)
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'wechat_spider.middlewares.WechatSpiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'wechat_spider.middlewares.RandomUserAgent': 10,
    'wechat_spider.middlewares.ProxyMiddleWare': 100,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'wechat_spider.pipelines.WechatSpiderPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MySQL database configuration (used by the item pipeline)
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'alicelmx'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'bamajie521mysql'
# end of MySQL database configuration

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# wechat_public_spider
Crawl WeChat official account (公众号) articles through the Sogou search engine.

Updated by 小明酱 on New Year's Day 2018. The code is still rough; if you run into crawler problems, feel free to discuss — the comment section is always open!

Two weeks of my internship have passed and the workload is still light. My supervisor is very nice — a military-academy graduate — and the task he assigned is closer to my research direction than I expected: a public-opinion monitoring system for WeChat official account articles. The overall design of the system is shown below:

![public-opinion monitoring system](http://img.blog.csdn.net/20171222085524606?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvYWxpY2VsbXg=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)

The first week's task is crawling official account articles, with the following features:

- Crawl the title, body, publication time and account name of articles, searching either by official account ID or by keyword
- Store the data in the database with the correct encoding
- Insert only newly found articles into the database
- In keyword mode, sort articles by time and crawl across result pages

All of the above is implemented. I genuinely feel that learning inside a project is the most efficient way, though I also tend to settle for a shallow understanding; next week I hope to grasp the new material more deeply and write a proper summary rather than stopping at plug-and-play programming.

Below is the overall approach ([GitHub code here](https://github.com/alicelmx/wechat_public_spider)):
### ***Main approach***
- Send requests to the [Sogou search engine, WeChat's search partner](http://weixin.sogou.com/), and crawl indirectly. Two search modes are available, as shown below: search articles and search official accounts.

Entering an official account ID gives the spider's start URL:
```
http://weixin.sogou.com/weixin?type=1&s_from=input&query=+公众号ID+&ie=utf8&_sug_=n&_sug_type_=
```
![searching for a specific official account](http://img.blog.csdn.net/20171222092034734?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvYWxpY2VsbXg=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)

- Environment: Python 3.5 | Scrapy | macOS | MySQL

### ***Designing crawl rules from the page structure***
I hesitated at this stage for a long time: many demos implement flashy anti-anti-crawling tricks, which made me nervous. Many thanks to 漫步大佬 for walking me through it — his disdain made me improve.

- Crawling an account's latest ten articles by its ID: a simple three-level crawl.
**Use the account's English ID in the "search official accounts" mode. The English ID is unique to an account, while the Chinese name may be duplicated; the name must also be entered exactly right, otherwise the search returns a lot of noise. This keeps the filtering work small: we only need the single result matching that unique English ID.**

1. Level one: find the target account and get the link to its profile page

![level one: search result](http://img.blog.csdn.net/20171222092516780?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvYWxpY2VsbXg=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)

2. Level two: go to the profile page and collect the link of each article

![level two: article list on the profile page](http://img.blog.csdn.net/20171222092612772?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvYWxpY2VsbXg=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)

3. Level three: open each article page and scrape the three fields in the green boxes plus the article body

![level three: article page](http://img.blog.csdn.net/20171222093038688?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvYWxpY2VsbXg=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)

That is the main idea. I will not go into the Chrome DevTools inspection details, since they are routine; a condensed illustration of the level-two parsing step follows below.
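The profile page does not expose the article list as plain HTML links; it embeds it in a JavaScript variable, which `parseArticleList()` in `spiders/wechat.py` extracts with a regex and parses as JSON. The standalone sketch below replays that step on a hypothetical, heavily trimmed sample of the embedded structure:

```
import json
import re

# Hypothetical, trimmed stand-in for the profile page; the real page embeds the
# ten latest articles in the same "var msgList = {...};" variable.
sample_html = '<script>var msgList = {"list": [{"app_msg_ext_info": {"title": "Some article", "content_url": "/s?__biz=XXXX&amp;mid=1&amp;sn=abc"}}]};</script>'

# Same regex and URL clean-up as parseArticleList() in spiders/wechat.py
match = re.search(r'var msgList = (\{.*?\});', sample_html)
for data in json.loads(match.group(1))['list']:
    info = data['app_msg_ext_info']
    url = 'https://mp.weixin.qq.com' + info['content_url'].replace('amp;', '')
    print(info['title'], url)
```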
- Crawling related articles by keyword — a two-level crawl along the same lines — but here a serious problem shows up:
when I filter and sort by time to get only fresh articles, the filter produces a URL

![time filter in the search UI](http://img.blog.csdn.net/20180101222644849?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvYWxpY2VsbXg=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)

URL:
```
http://weixin.sogou.com/weixin?type=2&ie=utf8&query=%E6%98%A5%E8%8A%82&tsn=1&ft=&et=&interation=&wxid=&usip=
```
Yet when I paste this URL back into the browser, it redirects to the WeChat search home page. So what now? Let's look at the developer tools first:

![request parameters in the developer tools](http://img.blog.csdn.net/20180101222905740?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvYWxpY2VsbXg=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)

These parameters match the URL exactly. Pay particular attention to tsn: it restricts the results to articles from the last day. The request should therefore be built like this:

```
return [scrapy.FormRequest(url='http://weixin.sogou.com/weixin',
                           formdata={'type': '2',
                                     'ie': 'utf8',
                                     'query': key,
                                     'tsn': '1',
                                     'ft': '',
                                     'et': '',
                                     'interation': '',
                                     'sst0': str(int(time.time()*1000)),
                                     'page': str(self.page),
                                     'wxid': '',
                                     'usip': ''},
                           method='get')]
```
That solves the problem.
### ***Storage details***
The amount of data to store is fairly large — many samples are needed to train a model later — and everything goes into the database. One detail deserves attention: encoding. Official account articles are full of emoji characters, which leads to errors such as:
```
pymysql.err.InternalError: (1366, "Incorrect string value: '\\xF0\\x9F\\x93\\xBD \\xC2...' for column 'article' at row 1")
```
The fix (switching the connection and table to utf8mb4) is described here:
http://blog.csdn.net/alicelmx/article/details/78890311
http://blog.csdn.net/alicelmx/article/details/78890914

### ***Fighting anti-crawling measures***
The main obstacle is the CAPTCHA, but it can be avoided as long as the crawl is not too fast, so two strategies are used:

- Add a random User-Agent and a random proxy IP in the downloader middleware.
Several batches of crawled proxies turned out to be unusable, which wasted a lot of time; how to add proxy IPs is described here:
http://blog.csdn.net/alicelmx/article/details/78947884

- Set the delay parameter in the settings to a random number:
```
import random
DOWNLOAD_DELAY = random.randint(1, 3)
```
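Note that `random.randint(1, 3)` runs once, when the settings module is loaded, so each run uses a single fixed delay rather than a new random delay per request. If per-request jitter is the goal, Scrapy's built-in `RANDOMIZE_DOWNLOAD_DELAY` setting (enabled by default) already multiplies `DOWNLOAD_DELAY` by a random factor between 0.5 and 1.5, so a sketch like this achieves the same effect:
```
DOWNLOAD_DELAY = 2
# With this enabled (the default), Scrapy waits between 1 and 3 seconds between requests
RANDOMIZE_DOWNLOAD_DELAY = True
```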
--------------------------------------------------------------------------------
/wechat_spider/wechat_spider/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
import random

class WechatSpiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

class RandomUserAgent:
    def __init__(self, agents):
        # Use the USER_AGENTS list from the settings if one is configured,
        # otherwise fall back to this built-in list.
        self.agents = agents or [
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
        ]

    @classmethod
    def from_crawler(cls, crawler):
        # Read the (optional) USER_AGENTS list from the settings
        return cls(crawler.settings.getlist('USER_AGENTS'))

    def process_request(self, request, spider):
        # Set a random User-Agent header on the request
        request.headers.setdefault('User-Agent', random.choice(self.agents))


# Proxy middleware
class ProxyMiddleWare(object):
    proxy_list = [
        "http://58.87.89.234:3128",
        "http://139.201.202.140:53281",
        "http://27.37.123.30:9000",
        "http://218.67.82.146:36709",
        "http://222.222.169.60:53281",
        "http://120.33.247.233:46884",
        "http://114.215.18.7:3128",
        "http://112.74.94.142:3128",
        "http://122.72.18.34:80",
        "http://36.33.25.123:808",
        "http://123.138.89.133:9999",
        "http://111.231.192.61:8080",
        "http://59.41.202.228:53281",
        "http://222.241.14.187:8888",
        "http://61.155.164.106:3128",
        "http://27.40.156.43:61234",
        "http://14.29.84.50:8080",
        "http://116.25.100.62:9797",
        "http://58.21.183.144:80",
        "http://14.221.166.205:9000",
        "http://115.231.50.10:53281",
        "http://120.34.205.40:808",
        "http://123.139.56.238:9999",
        "http://113.116.170.232:9000",
        "http://116.17.236.36:808",
        "http://114.232.163.73:34837",
        "http://171.35.103.37:808",
        "http://27.46.51.232:9797",
        "http://223.247.255.207:24714",
        "http://223.241.117.179:8010",
        "http://222.186.12.102:57624"]

    def process_request(self, request, spider):
        # Attach a randomly chosen proxy to every request
        request.meta['proxy'] = random.choice(self.proxy_list)

--------------------------------------------------------------------------------
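The proxy list above is hardcoded, while `proxyCrawl.py` writes its verified proxies to `proxy.txt` (one quoted `http://ip:port` per line). A possible way to connect the two — a sketch, not part of the original project — is a middleware variant that reads that file when it is instantiated; it would replace `ProxyMiddleWare` in `DOWNLOADER_MIDDLEWARES` if used:

```
import random

# Hypothetical variant of ProxyMiddleWare that loads proxies from the proxy.txt
# file produced by proxyCrawl.py instead of using a hardcoded list.
class FileProxyMiddleWare(object):
    def __init__(self, proxy_file='proxy.txt'):
        with open(proxy_file) as fp:
            # proxyCrawl.ProxyPool.writeToTxt writes lines like "http://1.2.3.4:8080"
            # (quotes included), so strip whitespace and quotes here.
            self.proxy_list = [line.strip().strip('"') for line in fp if line.strip()]

    def process_request(self, request, spider):
        request.meta['proxy'] = random.choice(self.proxy_list)
```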
/wechat_spider/wechat_spider/spiders/wechat.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from wechat_spider.items import WechatSpiderItem
import json
import re
from bs4 import BeautifulSoup
import time

class WechatSpider(scrapy.Spider):
    name = 'wechat'
    # allowed_domains = ['weixin.sogou.com/']
    headers = {'Host': 'weixin.sogou.com',
               'Referer': 'http://weixin.sogou.com/weixin?type=2&s_from=input&query=%E6%B5%85%E5%B1%B1%E5%B0%8F%E7%AD%91&ie=utf8&_sug_=y&_sug_type_=&w=01019900&sut=5109&sst0=1513697178371&lkt=0%2C0%2C0',
               'Upgrade-Insecure-Requests': '1',
               'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

    # Note: these prompts run at class-definition time, i.e. as soon as the spider module is imported.
    print('''
    **************************
    Welcome to the WeChat official account spider
    Created on 2017-12-21
    @author: 小明酱lmx
    **************************
    ''')
    print("1. Search by official account ID; 2. Search by keyword")
    choice = input("Enter the query mode: ")
    page = 1

    def start_requests(self):
        if self.choice == '1':
            id = input("Enter the official account ID to query: ")
            start_urls = 'http://weixin.sogou.com/weixin?type=1&s_from=input&query=' + id + '&ie=utf8&_sug_=n&_sug_type_='
            return [scrapy.Request(start_urls, callback=self.parse1)]

        elif self.choice == '2':
            key = input("Enter the keyword to query: ")
            maxNo = int(input('Enter the maximum page number to crawl: '))
            return [scrapy.FormRequest(url='http://weixin.sogou.com/weixin',
                                       formdata={'type': '2',
                                                 'ie': 'utf8',
                                                 'query': key,
                                                 'tsn': '1',
                                                 'ft': '',
                                                 'et': '',
                                                 'interation': '',
                                                 'sst0': str(int(time.time()*1000)),
                                                 'page': str(self.page),
                                                 'wxid': '',
                                                 'usip': ''},
                                       method='get',
                                       meta={'key': key, 'maxNo': maxNo},
                                       headers=self.headers,
                                       callback=self.parse2)]
        else:
            print('Invalid input, exiting.')
            return

    def parse1(self, response):
        # Get the link to the official account's profile page
        publicUrl = response.xpath("//p[@class='tit']/a[@target='_blank']/@href").extract()[0]
        print("*********" + publicUrl + "************")
        yield scrapy.Request(publicUrl, cookies={'viewed': '"1083428"', '__utmv': '30149280.3975'}, callback=self.parseArticleList)

    def parseArticleList(self, response):
        # The profile page embeds the article list as a JS variable: var msgList = {...};
        patt = re.compile(r'var msgList = (\{.*?\});')
        result = patt.search(response.text)
        url_list = json.loads(result.group(1))['list']
        for data in url_list:
            title = data['app_msg_ext_info']['title']
            article_url = data['app_msg_ext_info']['content_url']
            url = 'https://mp.weixin.qq.com' + article_url.replace(r'amp;', '')
            yield scrapy.Request(url, meta={'title': title}, callback=self.parseArticle)

    def parseArticle(self, response):
        item = WechatSpiderItem()
        item['title'] = response.meta['title']
        soup = BeautifulSoup(response.text, 'lxml')
        item['publishTime'] = soup.find('em', attrs={'class': 'rich_media_meta rich_media_meta_text'}).get_text()
        item['article'] = soup.find('div', attrs={'class': 'rich_media_content '}).get_text()
        item['publicName'] = response.xpath("//a[@class='rich_media_meta rich_media_meta_link rich_media_meta_nickname']/text()").extract()[0]
        yield item

    # Collect the link of every article on a keyword-search result page and crawl each one
    def parse2(self, response):
        key = response.meta['key']
        maxNo = response.meta['maxNo']
        soup = BeautifulSoup(response.text, 'lxml')
        node_soup = soup.find('ul', attrs={'class': 'news-list'})

        for node in node_soup.findAll('li'):
            url = node.select('div h3 a')[0]['href']
            yield scrapy.Request(url, callback=self.parseArticleBody)

        # Crawl across result pages up to the requested maximum
        while self.page < maxNo:
            self.page += 1
            yield scrapy.FormRequest(url='http://weixin.sogou.com/weixin',
                                     formdata={'type': '2', 'ie': 'utf8', 'query': key, 'tsn': '1', 'ft': '', 'et': '', 'interation': '', 'sst0': str(int(time.time() * 1000)), 'page': str(self.page), 'wxid': '', 'usip': ''},
                                     method='get',
                                     meta={'key': key, 'maxNo': maxNo},
                                     headers=self.headers,
                                     callback=self.parse2)

    # Scrape the required fields from each article page
    def parseArticleBody(self, response):
        item = WechatSpiderItem()
        item['title'] = response.xpath("//div[@id='img-content']/h2[@class='rich_media_title']/text()").extract()[0].strip().replace('\r', '').replace('\n', '').replace('\t', '')
        soup = BeautifulSoup(response.text, 'lxml')
        item['publishTime'] = soup.find('em', attrs={'class': 'rich_media_meta rich_media_meta_text'}).get_text()
        item['article'] = soup.find('div', attrs={'class': 'rich_media_content '}).get_text()
        item['publicName'] = response.xpath("//a[@class='rich_media_meta rich_media_meta_link rich_media_meta_nickname']/text()").extract()[0]
        yield item

--------------------------------------------------------------------------------
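For reference, this is a standard Scrapy project layout, so — assuming the dependencies used in the code (scrapy, pymysql, beautifulsoup4, lxml) are installed and the MySQL table from the sketch above exists — the spider would be started from the directory containing scrapy.cfg with:

```
scrapy crawl wechat
```

The spider then prompts interactively for the query mode and for either an official account ID or a keyword plus a maximum page number, as implemented in start_requests().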