├── .idea
│   ├── encodings.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── preferred-vcs.xml
│   └── weixin.iml
├── README.md
├── scrapy.cfg
└── weixin
    ├── __init__.py
    ├── items.py
    ├── main.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── WeiXin.py
        └── __init__.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Code revised and successfully debugged ---- 2017-04-04 #
## 1. Overview ##
**Target URL:** [http://weixin.sogou.com/weixin?type=2&query=python&ie=utf8](http://weixin.sogou.com/weixin?type=2&query=python&ie=utf8)

**What it does**: crawls the Sogou WeChat search results for articles about Python, extracting each article's title, title link, and description, as shown in the figure below.

**Data**: the scraped data is not persisted anywhere; this exercise is mainly about learning how to set up IP and user-agent pools. For a fuller open-source project on Sogou WeChat official accounts, see [wechat_sogou_crawl, an official-account article crawler based on Sogou WeChat](https://github.com/pujinxiao/wechat_sogou_crawl).

Figure 1

![](http://images2015.cnblogs.com/blog/1129740/201704/1129740-20170404160803769-1659644711.png)

## 2. How to run ##
1. `key` is the search keyword and can be changed.
2. Configure the proxy IP and User-Agent pools, then run main.py (an alternative launcher is sketched right after this list).
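main.py simply shells out to `scrapy crawl WeiXin` through `cmdline.execute`. As a hedged alternative (a sketch, not part of this repository), the spider can also be launched in-process with Scrapy's `CrawlerProcess`, assuming the script is run from the project root so `scrapy.cfg` and the project settings are discoverable:

```python
# Sketch: launch the WeiXin spider in-process instead of via main.py.
# Assumes execution from the project root so weixin.settings can be loaded.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from weixin.spiders.WeiXin import WeixinSpider

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl(WeixinSpider)
    process.start()  # blocks until the crawl finishes
```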
## 3. Problems ---- questions and comments are welcome ##
Disclaimer: the main purpose of this project is to learn how to set up an IP proxy pool.
> 1. The proxy IPs I used came from free proxy sites, and genuinely usable free IPs are rare now, especially against the Sogou WeChat endpoint. Crawling works with my local IP, but with a proxy IP the XPath expressions return empty values. Maybe the proxy quality is poor, or the page does not finish loading (still unresolved); see the log below.

    2017-04-04 16:57:40 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
    Current IP: 124.65.238.166:80
    Current user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36
    2017-04-04 16:57:40 [scrapy.core.engine] DEBUG: Crawled (200) (referer: None)
    Current IP: 124.65.238.166:80
    Current user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36
    Current IP: 124.65.238.166:80
    Current user-agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3
    Current IP: 124.65.238.166:80
    Current user-agent: Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3
    Current IP: 124.65.238.166:80
    Current user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36
    Current IP: 124.65.238.166:80
    Current user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36
    Current IP: 124.65.238.166:80
    Current user-agent: Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3
    Current IP: 124.65.238.166:80
    Current user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36
    Current IP: 124.65.238.166:80
    Current user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36
    Current IP: 124.65.238.166:80
    Current user-agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3
    Current IP: 124.65.238.166:80
    Current user-agent: Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3
    2017-04-04 16:57:41 [scrapy.core.engine] DEBUG: Crawled (200) (referer: http://sogou.com/)
    2017-04-04 16:57:41 [scrapy.core.scraper] DEBUG: Scraped from <200 http://weixin.sogou.com/weixin?query=python&type=2&page=10>
    {'dec': [], 'link': [], 'title': []}
    2017-04-04 16:57:42 [scrapy.core.engine] DEBUG: Crawled (200) (referer: http://sogou.com/)
    2017-04-04 16:57:42 [scrapy.core.scraper] DEBUG: Scraped from <200 http://weixin.sogou.com/weixin?query=python&type=2&page=9>
    {'dec': [], 'link': [], 'title': []}
    2017-04-04 16:57:44 [scrapy.core.engine] DEBUG: Crawled (200) (referer: http://sogou.com/)

**Anyone interested is welcome to help optimize this and track down the problem; I will merge your code and credit you as a contributor so we can improve together.**
## 4. Notes ##
I. Ways to handle anti-crawling mechanisms:

1. Browser disguise -------- user-agent pool;
2. IP rate limiting -------- IP proxy pool;
3. AJAX / asynchronous JS -------- capture the underlying requests;
4. CAPTCHAs -------- captcha-solving platforms.

II. Miscellaneous points:

1. def process_request():  # handle each outgoing request
       request.meta["proxy"] = ....  # attach the proxy IP here
2. In Scrapy a request that fails twice is given up, which usually means that proxy IP is no good (a sketch of the pattern follows this list).
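To make the skeleton in point 1 concrete, here is a minimal illustrative sketch (it is not this project's actual middleware, which lives in `weixin/middlewares.py`; the class name and proxy value are placeholders), together with Scrapy's standard `RETRY_TIMES` setting, which governs the two-retries-then-give-up behaviour mentioned in point 2:

```python
# Illustrative only: a bare-bones downloader middleware that attaches a
# random proxy to every outgoing request.
import random


class SimpleProxyMiddleware(object):
    # Placeholder pool; in practice this would hold verified, working proxies.
    proxies = ['124.65.238.166:80']

    def process_request(self, request, spider):
        request.meta["proxy"] = "http://" + random.choice(self.proxies)

# In settings.py: Scrapy's built-in RetryMiddleware retries a failed request
# RETRY_TIMES times (default 2) before giving up; raising it can help a little
# when free proxies are unreliable.
# RETRY_TIMES = 5
```

Raising the retry count only papers over flaky free proxies; the more reliable fix is feeding the pool with proxies that have actually been verified against the target site.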
----------
If this project is useful to you, please give it a star ---- many thanks.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = weixin.settings

[deploy]
#url = http://localhost:6800/
project = weixin
--------------------------------------------------------------------------------
/weixin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pujinxiao/weixin/d3ec0c9204ee8688c9b1efbf9d9238b0296df34e/weixin/__init__.py
--------------------------------------------------------------------------------
/weixin/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class WeixinItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()  # article title
    link = scrapy.Field()   # article link
    dec = scrapy.Field()    # article description
--------------------------------------------------------------------------------
/weixin/main.py:
--------------------------------------------------------------------------------
# Convenience entry point: equivalent to running `scrapy crawl WeiXin` from the shell.
from scrapy import cmdline

cmdline.execute('scrapy crawl WeiXin'.split())
--------------------------------------------------------------------------------
/weixin/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import random

from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware  # built-in proxy middleware, standard import
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware  # built-in user-agent middleware, standard import


class IPPOOLS(HttpProxyMiddleware):
    '''Routes each request through a proxy IP picked at random from the pool.'''

    ip_pools = [
        {'ip': '124.65.238.166:80'},
        # {'ip': ''},
    ]

    def __init__(self, ip=''):
        self.ip = ip

    def process_request(self, request, spider):
        '''Pick a random proxy IP and attach it to the outgoing request.'''
        ip = random.choice(self.ip_pools)
        print('Current IP: ' + ip['ip'])
        try:
            request.meta["proxy"] = "http://" + ip['ip']
        except Exception as e:
            print(e)


class UAPOOLS(UserAgentMiddleware):
    '''Sets a User-Agent picked at random from the pool on each request.'''

    user_agent_pools = [
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36',
    ]

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        '''Pick a random User-Agent and set it on the outgoing request.'''
        ua = random.choice(self.user_agent_pools)
        print('Current user-agent: ' + ua)
        try:
            request.headers.setdefault('User-Agent', ua)
        except Exception as e:
            print(e)
--------------------------------------------------------------------------------
/weixin/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import re


class WeixinPipeline(object):
    def process_item(self, item, spider):
        tag_re = re.compile(r'<[^>]+>')  # strips HTML tags
        for i in range(len(item['title'])):
            title = tag_re.sub('', item['title'][i])  # title text without markup
            print(title)
            print(item['link'][i])                    # title link
            dec = tag_re.sub('', item['dec'][i])      # description text without markup
            print(dec)
        return item
--------------------------------------------------------------------------------
/weixin/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for weixin project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'weixin'

SPIDER_MODULES = ['weixin.spiders']
NEWSPIDER_MODULE = 'weixin.spiders'


# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'weixin.middlewares.WeixinSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
    'weixin.middlewares.IPPOOLS': 124,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 125,
    'weixin.middlewares.UAPOOLS': 126,
}
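# Note on the priorities above: each custom pool middleware is registered one
# slot after the built-in middleware it subclasses (HttpProxyMiddleware 123 ->
# IPPOOLS 124, UserAgentMiddleware 125 -> UAPOOLS 126), so its process_request
# runs right after the built-in's. Because the built-in UserAgentMiddleware has
# already set a User-Agent header by then, the headers.setdefault() call in
# UAPOOLS may not actually replace it; assigning request.headers['User-Agent']
# directly (or dropping the built-in entry) would guarantee the random UA is sent.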
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'weixin.pipelines.WeixinPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
/weixin/spiders/WeiXin.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from weixin.items import WeixinItem


class WeixinSpider(scrapy.Spider):
    name = "WeiXin"
    allowed_domains = ["sogou.com"]
    start_urls = ['http://sogou.com/']

    def parse(self, response):
        '''Generate one search-result request per page.'''
        key = 'python'
        for i in range(1, 11):
            url = 'http://weixin.sogou.com/weixin?query=' + key + '&type=2&page=' + str(i)
            yield Request(url=url, callback=self.get_content)

    def get_content(self, response):
        '''Extract the fields of interest from one result page.'''
        item = WeixinItem()
        item['title'] = response.xpath('//div[@class="txt-box"]/h3/a').extract()       # all titles on the page (10, with HTML tags)
        item['link'] = response.xpath('//div[@class="txt-box"]/h3/a/@href').extract()  # all title links on the page (10)
        item['dec'] = response.xpath('//p[@class="txt-info"]').extract()               # all descriptions on the page (10, with HTML tags)
        yield item
--------------------------------------------------------------------------------
/weixin/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------