├── crawler
│   ├── __init__.py
│   ├── commands
│   │   ├── __init__.py
│   │   └── crawlall.py
│   ├── contrib
│   │   ├── __init__.py
│   │   └── socks.py
│   ├── middleware
│   │   ├── __init__.py
│   │   ├── proxy.py
│   │   └── random_user_agent.py
│   ├── proxy.txt
│   ├── spiders
│   │   ├── __init__.py
│   │   ├── cnproxy.py
│   │   ├── socksproxy.py
│   │   ├── 66ip.py
│   │   ├── xici.py
│   │   ├── haodaili.py
│   │   ├── proxylists.py
│   │   ├── xroxy.py
│   │   ├── chunzhen.py
│   │   ├── kuaidaili.py
│   │   └── qiaodm.py
│   ├── items.py
│   ├── settings.py
│   └── pipelines.py
├── .gitignore
├── scrapy.cfg
├── LICENSE.txt
└── README.md
/crawler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /crawler/commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /crawler/contrib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /crawler/middleware/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | 4 | -------------------------------------------------------------------------------- /crawler/proxy.txt: -------------------------------------------------------------------------------- 1 | anonymous://121.8.98.202:8080 2 | anonymous://121.8.98.201:8080 3 | socks5://27.152.181.217:80 4 | -------------------------------------------------------------------------------- /crawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = crawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = crawler 12 | -------------------------------------------------------------------------------- /crawler/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class ProxyIPItem(scrapy.Item): 11 | ip = scrapy.Field() 12 | port = scrapy.Field() 13 | type = scrapy.Field() 14 | speed = scrapy.Field() 15 | post = scrapy.Field() 16 | ssl = scrapy.Field() 17 | 18 | -------------------------------------------------------------------------------- /crawler/middleware/proxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | from scrapy.conf import settings 4 | import logging 5 | 6 | logger = logging.getLogger("crawler.middleware.proxy") 7 | 8 | class StaticProxyMiddleware(object): 9 | # overwrite process request 10 | def process_request(self, request, spider): 11 | # Set the location of the proxy 12 | proxy = settings.get("PROXY") 13 | logger.info("process request %s using proxy %s" % (request, proxy)) 14 | request.meta['proxy'] = proxy 15 | 16 | class RandomProxyMiddleware(object): 17 | 18 | def process_request(self, request, spider): 19 | proxy = random.choice(settings.get('PROXY_LIST')) 20 | logger.info("process request %s using proxy %s" % (request, proxy)) 21 | request.meta['proxy'] = proxy 22 | -------------------------------------------------------------------------------- /crawler/spiders/cnproxy.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import Spider 2 | from scrapy.http import Request 3 | from scrapy.selector import Selector 4 | from crawler.items import ProxyIPItem 5 | 6 | class CnProxySpider(Spider): 7 | name = "cnproxy" 8 | allowed_domains = ["cn-proxy.com"] 9 | start_urls = [ 10 | "http://cn-proxy.com/", 11 | "http://cn-proxy.com/archives/218" 12 | ] 13 | referer = "http://cn-proxy.com/" 14 | 15 | def start_requests(self): 16 | for item in self.start_urls: 17 | yield Request(url=item, headers={'Referer': self.referer}) 18 | 19 | def parse(self, response): 20 | ip_list = response.xpath('//table[@class="sortable"]/tbody/tr') 21 | for ip in ip_list: 22 | item = ProxyIPItem() 23 | item['ip'] = ip.xpath('td[1]/text()').extract()[0] 24 | item['port'] = ip.xpath('td[2]/text()').extract()[0] 25 | item['type'] = 'http' 26 | yield item 27 | -------------------------------------------------------------------------------- /crawler/spiders/socksproxy.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import Spider 2 | from scrapy.http import Request 3 | from scrapy.selector import Selector 4 | from crawler.items import ProxyIPItem 5 | 6 | class SocksProxySpider(Spider): 7 | name = "socksproxy" 8 | allowed_domains = ["socks-proxy.net"] 9 | start_urls = [ 10 | "http://www.socks-proxy.net" 11 | ] 12 
| referer = "http://www.socks-proxy.net" 13 | 14 | def start_requests(self): 15 | for item in self.start_urls: 16 | yield Request(url=item, headers={'Referer': self.referer}) 17 | 18 | def parse(self, response): 19 | ip_list = response.xpath('//*[@id="proxylisttable"]/tbody/tr') 20 | for ip in ip_list: 21 | item = ProxyIPItem() 22 | item['ip'] = ip.xpath('td[1]/text()').extract()[0] 23 | item['port'] = ip.xpath('td[2]/text()').extract()[0] 24 | item['type'] = ip.xpath('td[5]/text()').extract()[0].lower() 25 | yield item 26 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 xelzmm@gmail.com 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | -------------------------------------------------------------------------------- /crawler/spiders/66ip.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import Spider 2 | from scrapy.http import Request 3 | from scrapy.selector import Selector 4 | from crawler.items import ProxyIPItem 5 | import random 6 | import re 7 | 8 | from urllib import unquote 9 | from re import search 10 | 11 | class _66IPSpider(Spider): 12 | name = "66ip" 13 | allowed_domains = ["66ip.cn"] 14 | start_urls = [ 15 | "http://www.66ip.cn/mo.php?sxb=&tqsl=%s&port=&export=&ktip=&sxa=&submit=%%CC%%E1++%%C8%%A1&textarea=" % random.randint(3000, 5000), 16 | "http://www.66ip.cn/nmtq.php?getnum=%s&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=0&proxytype=2&api=71daili" % random.randint(3000, 5000) 17 | ] 18 | 19 | def start_requests(self): 20 | for item in self.start_urls: 21 | yield Request(url=item, headers={'Referer': item[:item.index('?')]}) 22 | 23 | def parse(self, response): 24 | ip_list = re.findall("\d+\.\d+\.\d+\.\d+:\d+", response.body) 25 | for ip in ip_list: 26 | item = ProxyIPItem() 27 | item['ip'] = ip[:ip.index(':')] 28 | item['port'] = ip[ip.index(":") + 1:] 29 | item['type'] = 'http' 30 | yield item 31 | 32 | -------------------------------------------------------------------------------- /crawler/spiders/xici.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import Spider 2 | from scrapy.http import Request 3 | from scrapy.selector import Selector 4 | from crawler.items import ProxyIPItem 5 | 6 | class XiciSpider(Spider): 7 | name = "xici" 8 | allowed_domains = ["xicidaili.com"] 9 | start_urls = [ 10 | "http://www.xicidaili.com/nn", 11 | "http://www.xicidaili.com/nn/2", 12 | "http://www.xicidaili.com/nn/3", 13 | "http://www.xicidaili.com/nn/4", 14 | "http://www.xicidaili.com/nn/5", 15 | "http://www.xicidaili.com/nn/6", 16 | "http://www.xicidaili.com/nn/7", 17 | "http://www.xicidaili.com/nn/8", 18 | "http://www.xicidaili.com/nn/9", 19 | "http://www.xicidaili.com/nn/10" 20 | ] 21 | referer = 'http://www.xicidaili.com/nn' 22 | 23 | def start_requests(self): 24 | for item in self.start_urls: 25 | yield Request(url=item, headers={'Referer': self.referer}) 26 | 27 | def parse(self, response): 28 | ip_list = response.xpath('//table[@id="ip_list"]/tr') 29 | if len(ip_list) > 0: 30 | ip_list.pop(0) 31 | for ip in ip_list: 32 | item = ProxyIPItem() 33 | item['ip'] = ip.xpath('td[2]/text()').extract()[0] 34 | item['port'] = ip.xpath('td[3]/text()').extract()[0] 35 | item['type'] = 'http' 36 | yield item 37 | -------------------------------------------------------------------------------- /crawler/commands/crawlall.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | from scrapy.exceptions import UsageError 3 | from scrapy.utils.conf import arglist_to_dict 4 | class Command(ScrapyCommand): 5 | requires_project = True 6 | def syntax(self): 7 | return '[options]' 8 | def short_desc(self): 9 | return 'Runs all of the spiders' 10 | def add_options(self, parser): 11 | ScrapyCommand.add_options(self, parser) 12 | parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE", 13 | help="set spider argument (may be repeated)") 14 | parser.add_option("-o", "--output", metavar="FILE", 15 | help="dump scraped items into FILE (use - for stdout)") 16 | parser.add_option("-t", 
"--output-format", metavar="FORMAT", 17 | help="format to use for dumping items with -o") 18 | def process_options(self, args, opts): 19 | ScrapyCommand.process_options(self, args, opts) 20 | try: 21 | opts.spargs = arglist_to_dict(opts.spargs) 22 | except ValueError: 23 | raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False) 24 | def run(self, args, opts): 25 | #settings = get_project_settings() 26 | 27 | spider_loader = self.crawler_process.spider_loader 28 | for spidername in args or spider_loader.list(): 29 | print "*********cralall spidername************" + spidername 30 | self.crawler_process.crawl(spidername, **opts.spargs) 31 | self.crawler_process.start() 32 | -------------------------------------------------------------------------------- /crawler/spiders/haodaili.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy.spiders import Spider 4 | from scrapy.http import Request 5 | from scrapy.selector import Selector 6 | from crawler.items import ProxyIPItem 7 | 8 | class HaodailiSpider(Spider): 9 | name = "haodaili" 10 | allowed_domains = ["haodailiip.com"] 11 | start_urls = [ 12 | "http://www.haodailiip.com/guonei", 13 | "http://www.haodailiip.com/guoji" 14 | ] 15 | referer = "http://www.haodailiip.com" 16 | 17 | def start_requests(self): 18 | for item in self.start_urls: 19 | yield Request(url=item, headers={'Referer': self.referer}) 20 | 21 | def parse(self, response): 22 | ip_list = response.xpath('/html/body/center/table[2]/tr/td[1]/table/tr') 23 | if len(ip_list) > 1: 24 | ip_list.pop(0) 25 | has_next = True 26 | for ip in ip_list: 27 | item = ProxyIPItem() 28 | columns = ip.xpath('td/text()').extract() 29 | item['ip'] = columns[0].strip() 30 | item['port'] = columns[1].strip() 31 | item['type'] = 'http' 32 | if columns[-1].strip() == u'超时': 33 | has_next = False 34 | yield item 35 | if has_next: 36 | url = "%s%s" % (self.referer, response.xpath('/html/body/center/table[2]/tr/td[1]/p/a[last()]/@href').extract()[0]) 37 | yield Request(url=url, headers={'Referer': response.url}) 38 | 39 | -------------------------------------------------------------------------------- /crawler/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for crawler project 4 | # 5 | # For simplicity, this file contains only the most important settings by 6 | # default. 
All the other settings are documented here: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # 10 | 11 | BOT_NAME = 'crawler' 12 | 13 | SPIDER_MODULES = ['crawler.spiders'] 14 | NEWSPIDER_MODULE = 'crawler.spiders' 15 | COMMANDS_MODULE = 'crawler.commands' 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | #USER_AGENT = 'crawler (+http://www.yourdomain.com)' 18 | 19 | DOWNLOADER_MIDDLEWARES = { 20 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware' : None, 21 | 'scrapy.downloadermiddlewares.retry.RetryMiddleware' : 300, 22 | 'crawler.middleware.random_user_agent.RandomUserAgentMiddleware' : 500, 23 | # 'crawler.middleware.proxy.StaticProxyMiddleware' : 100 , 24 | } 25 | 26 | ITEM_PIPELINES = { 27 | 'crawler.pipelines.ProxyScanPipeline': 500, 28 | # 'crawler.pipelines.PrintPipeline': 800, 29 | } 30 | 31 | LOG_LEVEL = 'INFO' 32 | 33 | DOWNLOAD_DELAY = 1 34 | import urllib2 35 | LOCAL_IP = urllib2.urlopen('http://ifconfig.io/ip').read()[:-1] 36 | # PROXY = "http://61.53.143.179:80" 37 | 38 | # used by StaticProxyMiddleware, if you want to crawl through a proxy server 39 | PROXY = "http://localhost:8088" 40 | 41 | # used by RandomProxyMiddleware, if you want to crawl through proxy servers 42 | PROXY_LIST = [ 43 | "http://120.83.5.164:18000", 44 | "http://111.161.126.100:80", 45 | "http://61.53.143.179:80" 46 | ] 47 | 48 | RETRY_HTTP_CODES = [500, 502, 503, 504, 400, 405, 408] 49 | -------------------------------------------------------------------------------- /crawler/spiders/proxylists.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import Spider 2 | from scrapy.http import Request 3 | from scrapy.selector import Selector 4 | from crawler.items import ProxyIPItem 5 | 6 | from urllib import unquote 7 | from re import search 8 | 9 | class ProxylistsSpider(Spider): 10 | name = "proxylists" 11 | allowed_domains = ["proxylists.net"] 12 | start_urls = [ 13 | "http://www.proxylists.net/cn_1_ext.html", 14 | ] 15 | referer = "http://www.proxylists.net/" 16 | 17 | def start_requests(self): 18 | for item in self.start_urls: 19 | yield Request(url=item, headers={'Referer': self.referer}) 20 | 21 | def parse(self, response): 22 | ip_list = response.xpath('body/font/b/table/tr[1]/td[2]/table/tr') 23 | if len(ip_list) > 3: 24 | ip_list.pop(1) 25 | ip_list.pop(0) 26 | ip = ip_list.pop() 27 | cur_page = int(search('cn_(\d+)_ext', response.url).group(1)) 28 | total = len(ip.xpath('td/b/a')) 29 | if total - cur_page > 1: 30 | yield Request(url="http://www.proxylists.net/cn_%d_ext.html" % (cur_page + 1), headers={'Referer': response.url}) 31 | for ip in ip_list: 32 | item = ProxyIPItem() 33 | item['ip'] = unquote(search('%22(.*)%22', ip.xpath('td/script/text()').extract()[0]).group(1)) 34 | item['port'] = ip.xpath('td[2]/text()').extract()[0] 35 | type = ip.xpath('td[3]/text()').extract()[0].lower() 36 | type = 'http' if type in ['anonymous', 'transparent', 'high anonymity', 'distorting'] else type 37 | item['type'] = type 38 | yield item 39 | 40 | -------------------------------------------------------------------------------- /crawler/spiders/xroxy.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import Spider 2 | from scrapy.http import Request 3 | from scrapy.selector import Selector 4 | from crawler.items import ProxyIPItem 5 | 6 | from re import search 7 | 8 | class XroxySpider(Spider): 9 | name = "xroxy" 10 | 
allowed_domains = ["xroxy.com"] 11 | start_urls = [ 12 | "http://www.xroxy.com/proxylist.php", 13 | ] 14 | referer = "http://www.xroxy.com/" 15 | 16 | def start_requests(self): 17 | for item in self.start_urls: 18 | yield Request(url=item, headers={'Referer': self.referer}) 19 | 20 | def parse(self, response): 21 | total = int(response.xpath('//*[@id="content"]/table[2]/tr/td[1]/table/tr[2]/td/small/b/text()').extract()[0]) / 10 22 | if response.url.find('pnum=') == -1: 23 | cur_page = 0 24 | else: 25 | cur_page = int(search('pnum=(\d+)', response.url).group(1)) 26 | if total - cur_page > 1: 27 | yield Request(url="http://www.xroxy.com/proxylist.php?pnum=%d" % (cur_page + 1), headers={'Referer': response.url}) 28 | ip_list = response.xpath('//*[@id="content"]/table[1]/tr[@class="row0"] | //*[@id="content"]/table[1]/tr[@class="row1"]') 29 | for ip in ip_list: 30 | item = ProxyIPItem() 31 | item['ip'] = ip.xpath('td[2]/a/text()').extract()[0].strip() 32 | item['port'] = ip.xpath('td[3]/a/text()').extract()[0].strip() 33 | type = ip.xpath('td[4]/a/text()').extract()[0].strip().lower() 34 | type = 'http' if type in ['anonymous', 'transparent', 'high anonymity', 'distorting'] else type 35 | item['type'] = type 36 | yield item 37 | 38 | -------------------------------------------------------------------------------- /crawler/spiders/chunzhen.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import Spider 2 | from scrapy.http import Request 3 | from scrapy.selector import Selector 4 | from crawler.items import ProxyIPItem 5 | 6 | class ChunzhenSpider(Spider): 7 | name = "chunzhen" 8 | allowed_domains = ["cz88.net"] 9 | start_urls = [ 10 | "http://www.cz88.net/proxy/index.shtml", 11 | "http://www.cz88.net/proxy/http_2.shtml", 12 | "http://www.cz88.net/proxy/http_3.shtml", 13 | "http://www.cz88.net/proxy/http_4.shtml", 14 | "http://www.cz88.net/proxy/http_5.shtml", 15 | "http://www.cz88.net/proxy/http_6.shtml", 16 | "http://www.cz88.net/proxy/http_7.shtml", 17 | "http://www.cz88.net/proxy/http_8.shtml", 18 | "http://www.cz88.net/proxy/http_9.shtml", 19 | "http://www.cz88.net/proxy/http_10.shtml", 20 | "http://www.cz88.net/proxy/socks4.shtml", 21 | "http://www.cz88.net/proxy/socks4_2.shtml", 22 | "http://www.cz88.net/proxy/socks4_3.shtml", 23 | "http://www.cz88.net/proxy/socks5.shtml", 24 | "http://www.cz88.net/proxy/socks5_2.shtml" 25 | ] 26 | referer = 'http://www.cz88.net/proxy/index.shtml' 27 | 28 | def start_requests(self): 29 | for item in self.start_urls: 30 | yield Request(url=item, headers={'Referer': self.referer}) 31 | 32 | def parse(self, response): 33 | ip_list = response.xpath('//div[@id="boxright"]/div/ul/li') 34 | if len(ip_list) > 0: 35 | ip_list.pop(0) 36 | for ip in ip_list: 37 | item = ProxyIPItem() 38 | item['ip'] = ip.xpath('div[@class="ip"]/text()').extract()[0] 39 | item['port'] = ip.xpath('div[@class="port"]/text()').extract()[0] 40 | if response.url.find('socks4') != -1: 41 | item['type'] = 'socks4' 42 | elif response.url.find('socks5') != -1: 43 | item['type'] = 'socks5' 44 | else: 45 | item['type'] = 'http' 46 | yield item 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ##Introduction 2 | 3 | **Proxy Server Crawler** is a tool used to crawl public proxy servers from proxy websites. 
Once a proxy server (ip::port::type) has been crawled, the tool tests the functionality of that server automatically. 4 | 5 | Currently supported websites: 6 | 7 | * http://www.66ip.cn 8 | * http://www.cz88.net 9 | * http://www.cn-proxy.com 10 | * http://www.haodailiip.com 11 | * http://www.kuaidaili.com 12 | * http://www.proxylists.net 13 | * http://www.qiaodm.net 14 | * http://www.socks-proxy.net 15 | * http://www.xroxy.com 16 | * http://www.xicidaili.com 17 | 18 | Currently supported tests (for http proxies): 19 | 20 | * ssl support 21 | * post support 22 | * speed (tested with 10 frequently used sites) 23 | * type (high/anonymous/transparent) 24 | 25 | ## Requirements 26 | 27 | * Python >= 2.7 28 | * Scrapy 1.3.0 (not tested with lower versions) 29 | * node (some sites require node to bypass a JavaScript-based WAF) 30 | 31 | ## Usage 32 | 33 | ```bash 34 | cd proxy_server_crawler 35 | scrapy crawl chunzhen 36 | ``` 37 | 38 | You can also run every spider at once with the custom `crawlall` command (`scrapy crawlall`, see `crawler/commands/crawlall.py`). Sample log output: 39 | 40 | ``` 41 | [ result] ip: 59.41.214.218 , port: 3128 , type: http, proxy server not alive or healthy. 42 | [ result] ip: 117.90.6.67 , port: 9000 , type: http, proxy server not alive or healthy. 43 | [ result] ip: 117.175.183.10 , port: 8123 , speed: 984 , type: high 44 | [ result] ip: 180.95.154.221 , port: 80 , type: http, proxy server not alive or healthy. 45 | [ result] ip: 110.73.0.206 , port: 8123 , type: http, proxy server not alive or healthy. 46 | [ proxy] ip: 124.88.67.54 , port: 80 , speed: 448 , type: high , post: True , ssl: False 47 | [ result] ip: 117.90.2.149 , port: 9000 , type: http, proxy server not alive or healthy. 48 | [ result] ip: 115.212.165.170, port: 9000 , type: http, proxy server not alive or healthy. 49 | [ proxy] ip: 118.123.22.192 , port: 3128 , speed: 769 , type: high , post: True , ssl: False 50 | [ proxy] ip: 117.175.183.10 , port: 8123 , speed: 908 , type: high , post: True , ssl: True 51 | ``` 52 | 53 | ## License 54 | 55 | The MIT License (MIT) 56 | -------------------------------------------------------------------------------- /crawler/spiders/kuaidaili.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import Spider 2 | from scrapy.http import Request 3 | from scrapy.selector import Selector 4 | from crawler.items import ProxyIPItem 5 | import re 6 | import os 7 | 8 | class KuaidailiSpider(Spider): 9 | name = "kuaidaili" 10 | allowed_domains = ["kuaidaili.com"] 11 | start_urls = [ 12 | "http://www.kuaidaili.com" 13 | ] 14 | referer = 'http://www.kuaidaili.com' 15 | 16 | def start_requests(self): 17 | for item in self.start_urls: 18 | yield Request(url=item, headers={'Referer': self.referer}, meta={'handle_httpstatus_list': [521]}, callback=self.parseInitRequest) 19 | 20 | def parse(self, response): 21 | ip_list = response.xpath('//div[@id="index_free_list"]/table/tbody/tr') 22 | for line in ip_list: 23 | item = ProxyIPItem(type="http") 24 | item["ip"] = line.xpath('td[1]/text()').extract()[0].strip() 25 | item["port"] = line.xpath('td[2]/text()').extract()[0].strip() 26 | yield item 27 | if response.request.url.find('proxylist') < 0: 28 | pages = response.xpath('//div[@id="listnav"]/ul/li/a') 29 | pages.pop(0) 30 | for page in pages: 31 | path = page.xpath('@href').extract()[0] 32 | yield Request(url=self.start_urls[0] + path, headers={'Referer': response.request.url, 'User-Agent': response.request.headers.get('User-Agent')}) 33 | 34 | 35 | def parseInitRequest(self, response): 36 | if response.status == 200: 37 | yield Request(url=response.request.url, 
headers={'Referer': self.referer, 'User-Agent': response.request.headers.get('User-Agent')}, dont_filter=True) 38 | return 39 | group = re.search('setTimeout\("\w+\((\d+)\)".*(function .*"\);})', response.body) 40 | key, function = group.group(1), group.group(2) 41 | script = '!%s(%s)' % (function.replace('=eval', '=console.log'), key) 42 | result = os.popen("node -e '%s'" % script).read() 43 | path = re.search('"(.*)"', result).group(1) 44 | url = response.request.url + path 45 | yield Request(url=url, headers={'Referer': response.request.url, 'User-Agent': response.request.headers.get('User-Agent')}, dont_filter=True) 46 | -------------------------------------------------------------------------------- /crawler/spiders/qiaodm.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import Spider 2 | from scrapy.http import Request 3 | from scrapy.selector import Selector 4 | from crawler.items import ProxyIPItem 5 | 6 | class QiaodmSpider(Spider): 7 | name = "qiaodm" 8 | allowed_domains = ["qiaodm.com"] 9 | start_urls = [ 10 | "http://ip.qiaodm.com/", 11 | "http://ip.qiaodm.com/free/index.html" 12 | ] 13 | referer = "http://ip.qiaodm.com/" 14 | 15 | def start_requests(self): 16 | for item in self.start_urls: 17 | yield Request(url=item, headers={'Referer': self.referer}, dont_filter=True) 18 | 19 | def parse(self, response): 20 | # pages = response.xpath('//*[@id="flip"]/div/span | //*[@id="flip"]/div/a') 21 | # if len(pages) > 4: 22 | # next_page = pages[-2].xpath('@href').extract() 23 | # if len(next_page) == 1: 24 | # yield Request(url='%s/%s' % (self.referer, next_page[0]), headers={'Referer': response.url}) 25 | if response.request.url == 'http://ip.qiaodm.com/free/index.html': 26 | hot_urls = response.xpath('//div[@class="freeb"]/a[contains(@href,"free")]/@href').extract() 27 | for url in hot_urls: 28 | yield Request(url=url, headers={'Referer': self.referer}) 29 | country_urls = response.xpath('//a[@class="item"]/@href').extract() 30 | for url in country_urls: 31 | yield Request(url=url, headers={'Referer': self.referer}) 32 | 33 | ip_list = response.xpath('//*[@id="main_container"]/div[1]/table/tbody/tr') 34 | if len(ip_list) > 2: 35 | ip_list.pop(1) 36 | ip_list.pop(0) 37 | for line in ip_list: 38 | item = ProxyIPItem() 39 | columns = line.xpath('td') 40 | ip_spans = columns[0].xpath('node()/script/text() | node()[not(contains(@style, "none"))]/text()').extract() 41 | item['ip'] = ''.join([a.replace('document.write(\'','').replace('\');','') for a in ip_spans]) 42 | # port = columns[1].xpath('text()').extract()[0] 43 | port = columns[1].xpath('@class').extract()[0].split(' ')[1] 44 | port = int(''.join([str("ABCDEFGHIZ".index(c)) for c in port])) / 8 45 | item['port'] = port 46 | # port = columns[1].xpath('script/text()').extract()[0] 47 | # port = port[port.index('=') + 1:port.index(';')] 48 | # item['port'] = ''.join([str(eval(a)) for a in port.split('+')]) 49 | item['type'] = 'http' 50 | yield item 51 | -------------------------------------------------------------------------------- /crawler/middleware/random_user_agent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware 4 | import logging 5 | 6 | class RandomUserAgentMiddleware(UserAgentMiddleware): 7 | 8 | def __init__(self, user_agent=''): 9 | self.logger = logging.getLogger("crawler.middleware.randomua") 10 | 
self.user_agent = user_agent 11 | 12 | def process_request(self, request, spider): 13 | ua = random.choice(self.user_agent_list) 14 | if request.headers.get('User-Agent') is not None: 15 | return 16 | request.headers.setdefault('User-Agent', ua) 17 | self.logger.info("process request %s using random ua: %s" % (request, ua)) 18 | 19 | #the default user_agent_list composes chrome,I E,firefox,Mozilla,opera,netscape 20 | #for more user agent strings,you can find it in http://www.useragentstring.com/pages/useragentstring.php 21 | user_agent_list = [ 22 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", 23 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET4.0C)", 24 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; qihu theworld)", 25 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Trident/4.0)", 26 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Trident/4.0; 360SE)", 27 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Trident/4.0; InfoPath.2)", 28 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.30729; LBBROWSER)", 29 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1)", 30 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; GreenBrowser)", 31 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.2; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)", 32 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; 2345Explorer 4.2.0.13850)", 33 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; .NET CLR 3.5.30729; Alexa Toolbar; .NET CLR 1.1.4322)", 34 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30618)", 35 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.4; KB974488)", 36 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; iCafeMedia; .NET CLR 2.0.50727; .NET4.0C; .NET4.0E)", 37 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; BIDUBrowser 2.x)", 38 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; baiduie8; 2345Explorer 4.2.0.13929)", 39 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; Apache; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729)", 40 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0; 360SE)", 41 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0; 2345Explorer 5.0.0.14067)", 42 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/7.0)", 43 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/7.0; KB974488)", 44 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0) LBBROWSER", 45 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; QQBrowser/8.0.2820.400)", 46 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36", 47 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36 LBBROWSER", 48 | "Mozilla/5.0 (Windows NT 5.1; rv:32.0) Gecko/20100101 Firefox/32.0", 49 | "Mozilla/5.0 (Windows NT 5.1; rv:25.0) Gecko/20100101 Firefox/25.0", 50 | "Mozilla/5.0 (Windows NT 5.1; rv:25.0) Gecko/20100101 Firefox/31.0", 51 | "Mozilla/5.0 (Windows NT 5.1; rv:25.0) Gecko/20100101 Firefox/36.0", 52 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.12 (KHTML, like Gecko) Maxthon/3.0 Chrome/18.0.966.0 
Safari/535.12", 53 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36 CoolNovo/2.0.9.20", 54 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36 LBBROWSER", 55 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 UBrowser/3.1.1644.34 Safari/537.36", 56 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36 OPR/27.0.1689.76", 57 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.2.0.3000 Chrome/30.0.1551.0 Safari/537.36", 58 | "Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0", 59 | "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; MALC; rv:11.0; QQBrowser/8.0.3345.400) like Gecko", 60 | "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0; QQBrowser/8.0.3197.400) like Gecko", 61 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.0.3000 Chrome/30.0.1599.101 Safari/537.36", 62 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3647.11 Safari/537.36", 63 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8 (.NET CLR 3.5.30729)", 64 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0", 65 | "UCWEB/2.0 (Linux; U; Adr 2.3.5; zh-CN; Lenovo A288t) U2/1.0.0 UCBrowser/9.6.2.404 U2/1.0.0 Mobile", 66 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36", 67 | "UCWEB/2.0 (MIDP-2.0; U; zh-CN; Lenovo S898t+) U2/1.0.0 UCBrowser/10.2.1.550 U2/1.0.0 Mobile", 68 | "UCWEB/2.0 (MIDP-2.0; U; zh-CN; MI 4C) U2/1.0.0 UCBrowser/10.2.0.535 U2/1.0.0 Mobile", 69 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36", 70 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36 QQBrowser/3.3.3201.400", 71 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:36.0) Gecko/20100101 Firefox/36.0", 72 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:36.0) Gecko/20100101 Firefox/36.0", 73 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; zh_cn) AppleWebKit/600.1.4.12.4 (KHTML, like Gecko) Version/5.0.5 Safari/600.1.4.12.4", 74 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.3.18 (KHTML, like Gecko) Version/7.1.3 Safari/537.85.12" 75 | ] 76 | -------------------------------------------------------------------------------- /crawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import urllib2 8 | import json 9 | import socket 10 | import exceptions 11 | import httplib 12 | import time 13 | from scrapy.exceptions import DropItem 14 | from crawler.items import * 15 | from threading import Thread 16 | from scrapy.conf import settings 17 | import logging 18 | import Queue 19 | import gzip 20 | from StringIO import StringIO 21 | 22 | # adding threading 23 | import threading 24 | lock = threading.Lock() 25 | fs = open('proxy.txt','w') # adding here for writing http proxy 26 | socket.setdefaulttimeout(2) 27 | localhost = settings.get('LOCAL_IP') 28 | logger = 
logging.getLogger('crawler.proxy.checker') 29 | proxy_headers = [ 30 | 'x-proxy-id', 31 | 'via', 32 | 'x-via', 33 | 'x-forwarded-for', 34 | 'forwarded-for', 35 | 'x-client-ip', 36 | 'client-ip', 37 | 'x-real-ip', 38 | 'real-ip', 39 | 'proxy-client-ip', 40 | 'wl-proxy-client-ip', 41 | 'x-bluecoat-via', 42 | 'x-cc-connectivity', 43 | 'x-mato-param', 44 | 'x-forwarded-host', 45 | 'x-forwarded-server' 46 | ] 47 | 48 | class CrawlerPipeline(object): 49 | def process_item(self, item, spider): 50 | return item 51 | 52 | class PrintPipeline(object): 53 | def process_item(self, item, spider): 54 | if isinstance(item, ProxyIPItem): 55 | logger.info('\033[33m[crawled]\033[m ip: \033[33m%-15s\033[m, port: \033[33m%-5s\033[m, type: \033[33m%s\033[m' % (item['ip'], item['port'], item['type'])) 56 | return item 57 | 58 | class ProxyScanPipeline(object): 59 | 60 | def __init__(self): 61 | logger.info("local ip address: %s" % localhost) 62 | self.queue = Queue.Queue() 63 | 64 | def open_spider(self, spider): 65 | logger.info("spider opened.") 66 | self.running = True 67 | for i in xrange(50): 68 | thread = Thread(target=self.scan_task, args=()) 69 | thread.start() 70 | 71 | def close_spider(self, spider): 72 | self.running = False 73 | fs.close() 74 | 75 | def process_item(self, item, spider): 76 | self.queue.put(item) 77 | return item 78 | 79 | def scan_task(self): 80 | while self.running or not self.queue.empty(): 81 | try: 82 | item = self.queue.get(True, 1) 83 | scan(item) 84 | except Queue.Empty: 85 | pass 86 | 87 | def scan(item, callback=None): 88 | result = test_proxy(item) 89 | if result is not None: 90 | # only write available http proxy 91 | lock.acquire() 92 | fs.write('%s://%s:%s\n' % (item['type'].lower(),item['ip'],item['port'])) 93 | lock.release() 94 | logger.info('\033[32m[ result]\033[m ip: \033[32m%-15s\033[m, port: \033[32m%-5s\033[m, speed: \033[32m%-4s\033[m, type: \033[32m%s\033[m' % (item['ip'], item['port'], item['speed'], item['type'])) 95 | if item['type'] in ['high', 'anonymous'] and test_http(item) is not None and item['speed'] < 2000: 96 | logger.info('\033[36m[ proxy]\033[m ip: \033[36m%-15s\033[m, port: \033[36m%-5s\033[m, speed: \033[36m%-4s\033[m, type: \033[36m%-11s\033[m, post: \033[36m%-5s\033[m, ssl: \033[36m%-5s\033[m' % (item['ip'], item['port'], item['speed'], item['type'], item['post'], item['ssl'])) 97 | if callback is not None: 98 | callback(item) 99 | else: 100 | logger.info('\033[31m[ result]\033[m ip: \033[31m%-15s\033[m, port: \033[31m%-5s\033[m, type: \033[31m%s\033[m, \033[31mproxy server not alive or healthy.\033[m' % (item['ip'], item['port'], item['type'])) 101 | 102 | def test_http(item, verbose=False): 103 | proxyHandler = urllib2.ProxyHandler({'http':'http://%s:%s' % (item['ip'], item['port']), 'https':'http://%s:%s' % (item['ip'], item['port'])}) 104 | opener = urllib2.build_opener(proxyHandler) 105 | opener.addheaders = { 106 | 'Accept-Encoding': 'gzip,deflate,sdch', 107 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 108 | 'User-Agent': 'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)', 109 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 110 | 'Cache-Control': 'max-age=0' 111 | }.items() 112 | check_map = { 113 | "http://zhidao.baidu.com/robots.txt": "Baiduspider", 114 | "http://weibo.com/robots.txt": "sitemap", 115 | "http://www.qq.com/robots.txt": "Disallow", 116 | "http://xyq.163.com/robots.txt": "sitemap", 117 | "http://www.cnbeta.com/robots.txt": "manager", 118 | 
"http://www.zhihu.com/robots.txt": "resetpassword", 119 | "http://www.iqiyi.com/robots.txt": "Disallow", 120 | "http://www.taobao.com/robots.txt": "User-agent", 121 | "http://www.jd.com/robots.txt": "EtaoSpider", 122 | "http://www.58.com/robots.txt": "User-agent" 123 | } 124 | bad = 0 125 | total_time = item['speed'] 126 | success = 1 127 | for url in check_map: 128 | try: 129 | req = urllib2.Request(url) 130 | begin = time.time() 131 | resp = opener.open(req) 132 | content = resp.read() 133 | if resp.info().get('Content-Encoding') == 'gzip': 134 | buf = StringIO(content) 135 | f = gzip.GzipFile(fileobj=buf) 136 | content = f.read() 137 | if content.find(check_map[url]) < 0: 138 | bad += 1 139 | if verbose: 140 | log.msg(repr(content), log.DEBUG) 141 | else: 142 | success += 1; 143 | total_time += int((time.time() - begin) * 1000) 144 | if verbose: 145 | log.msg("%s %d" % (url, int((time.time() - begin) * 1000)), log.DEBUG) 146 | except Exception, e: 147 | bad += 1 148 | if verbose: 149 | logger.error("%s %s" % (url, e)) 150 | if success * 1.0 / (len(check_map.items()) + 1) < 0.8: 151 | return None 152 | else: 153 | item['speed'] = total_time / success 154 | item['post'] = False 155 | try: 156 | req = urllib2.Request('http://httpbin.org/post', 'q=this_is_a_test') 157 | resp = opener.open(req) 158 | content = resp.read() 159 | if content.find('this_is_a_test') > 0: 160 | item['post'] = True 161 | except: 162 | pass 163 | item['ssl'] = False 164 | try: 165 | req = urllib2.Request('https://httpbin.org/get?q=this_is_a_test') 166 | resp = opener.open(req) 167 | content = resp.read() 168 | if content.find('this_is_a_test') > 0: 169 | item['ssl'] = True 170 | except: 171 | pass 172 | return item 173 | 174 | def test_proxy(item): 175 | try: 176 | item['port'] = int(item['port']) 177 | except ValueError: 178 | return None 179 | if item['type'] == 'http': 180 | proxyHandler = urllib2.ProxyHandler({'http':'http://%s:%s' % (item['ip'], item['port']), 'https':'http://%s:%s' % (item['ip'], item['port'])}) 181 | opener = urllib2.build_opener(proxyHandler) 182 | opener.addheaders = { 183 | 'Accept-Encoding': 'gzip', 184 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 185 | 'User-Agent': 'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)', 186 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 187 | 'Cache-Control': 'max-age=0' 188 | }.items() 189 | try: 190 | req = urllib2.Request('http://httpbin.org/get') 191 | begin = time.time() 192 | resp = opener.open(req) 193 | content = resp.read() 194 | item['speed'] = int((time.time() - begin) * 1000) 195 | content = json.loads(content) 196 | if content['origin'].find(localhost) != -1: 197 | # print '\t[Leak Header] X-Forwarded-For: %s' % content['origin'] 198 | item['type'] = 'transparent' 199 | return item 200 | if len(content['origin'].split(',')) > 1: 201 | # print '\t[Leak Header] X-Forwarded-For: %s' % content['origin'] 202 | item['type'] = 'anonymous' 203 | return item 204 | # logger.error('ip: %s' % item['ip']) 205 | # for key in content['headers']: 206 | # logger.error('%s: %s' % (key, content['headers'][key])) 207 | for key in content['headers']: 208 | if content['headers'][key].find(localhost) != -1: 209 | # print '\t[Leak Header] %s: %s' % (key, content['headers'][key]) 210 | item['type'] = 'transparent' 211 | return item 212 | if key.lower() in proxy_headers: 213 | # print '\t[Leak Header] %s: %s' % (key, content['headers'][key]) 214 | item['type'] = 'anonymous' 215 | if item['type'] == 'http': 216 | 
item['type'] = 'high' 217 | return item 218 | except exceptions.ValueError, error: 219 | # print 'host seems to be a proxy with limitation' 220 | # print error 221 | pass 222 | except httplib.BadStatusLine, error: 223 | # print error 224 | pass 225 | except urllib2.URLError, error: 226 | # print error 227 | pass 228 | except socket.timeout, error: 229 | # print error 230 | pass 231 | except socket.error, error: 232 | # print error 233 | pass 234 | elif item['type'] == 'socks4': 235 | sock = socket.socket() 236 | try: 237 | begin = time.time() 238 | sock.connect((item['ip'], int(item['port']))) 239 | sock.send('\x04\x01\x00\x50\x36\xaf\xde\xf6MOZ\x00') 240 | response = sock.recv(10) 241 | # print repr(response) 242 | if response.find('\x00\x5A') == 0: 243 | item['speed'] = int((time.time() - begin) * 1000) 244 | sock.close() 245 | return item 246 | except socket.timeout, error: 247 | # print error 248 | pass 249 | except socket.error, error: 250 | # print error 251 | pass 252 | elif item['type'] == 'socks5': 253 | sock = socket.socket() 254 | try: 255 | begin = time.time() 256 | sock.connect((item['ip'], int(item['port']))) 257 | sock.send('\x05\x01\x00') 258 | response = sock.recv(3) 259 | # print repr(response) 260 | if response.find('\x05\x00') == 0: 261 | item['speed'] = int((time.time() - begin) * 1000) 262 | sock.close() 263 | return item 264 | except socket.timeout, error: 265 | # print error 266 | pass 267 | except socket.error, error: 268 | # print error 269 | pass 270 | return None 271 | # raise DropItem('proxy server not alive or healthy.') 272 | 273 | 274 | if __name__ == '__main__': 275 | item = {} 276 | item['ip'] = '120.52.72.58' 277 | item['port'] = '80' 278 | item['speed'] = 1000 279 | item['type'] = 'anonymous' 280 | print test_http(item, True) 281 | -------------------------------------------------------------------------------- /crawler/contrib/socks.py: -------------------------------------------------------------------------------- 1 | """SocksiPy - Python SOCKS module. 2 | Version 1.00 3 | 4 | Copyright 2006 Dan-Haim. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without modification, 7 | are permitted provided that the following conditions are met: 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 3. Neither the name of Dan Haim nor the names of his contributors may be used 14 | to endorse or promote products derived from this software without specific 15 | prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY DAN HAIM "AS IS" AND ANY EXPRESS OR IMPLIED 18 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 19 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 20 | EVENT SHALL DAN HAIM OR HIS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA 23 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 24 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 25 | OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMANGE. 
26 | 27 | 28 | This module provides a standard socket-like interface for Python 29 | for tunneling connections through SOCKS proxies. 30 | 31 | """ 32 | 33 | import socket 34 | import struct 35 | 36 | PROXY_TYPE_SOCKS4 = 1 37 | PROXY_TYPE_SOCKS5 = 2 38 | PROXY_TYPE_HTTP = 3 39 | 40 | _defaultproxy = None 41 | _orgsocket = socket.socket 42 | 43 | class ProxyError(Exception): 44 | def __init__(self, value): 45 | self.value = value 46 | def __str__(self): 47 | return repr(self.value) 48 | 49 | class GeneralProxyError(ProxyError): 50 | def __init__(self, value): 51 | self.value = value 52 | def __str__(self): 53 | return repr(self.value) 54 | 55 | class Socks5AuthError(ProxyError): 56 | def __init__(self, value): 57 | self.value = value 58 | def __str__(self): 59 | return repr(self.value) 60 | 61 | class Socks5Error(ProxyError): 62 | def __init__(self, value): 63 | self.value = value 64 | def __str__(self): 65 | return repr(self.value) 66 | 67 | class Socks4Error(ProxyError): 68 | def __init__(self, value): 69 | self.value = value 70 | def __str__(self): 71 | return repr(self.value) 72 | 73 | class HTTPError(ProxyError): 74 | def __init__(self, value): 75 | self.value = value 76 | def __str__(self): 77 | return repr(self.value) 78 | 79 | _generalerrors = ("success", 80 | "invalid data", 81 | "not connected", 82 | "not available", 83 | "bad proxy type", 84 | "bad input") 85 | 86 | _socks5errors = ("succeeded", 87 | "general SOCKS server failure", 88 | "connection not allowed by ruleset", 89 | "Network unreachable", 90 | "Host unreachable", 91 | "Connection refused", 92 | "TTL expired", 93 | "Command not supported", 94 | "Address type not supported", 95 | "Unknown error") 96 | 97 | _socks5autherrors = ("succeeded", 98 | "authentication is required", 99 | "all offered authentication methods were rejected", 100 | "unknown username or invalid password", 101 | "unknown error") 102 | 103 | _socks4errors = ("request granted", 104 | "request rejected or failed", 105 | "request rejected because SOCKS server cannot connect to identd on the client", 106 | "request rejected because the client program and identd report different user-ids", 107 | "unknown error") 108 | 109 | def setdefaultproxy(proxytype=None,addr=None,port=None,rdns=True,username=None,password=None): 110 | """setdefaultproxy(proxytype, addr[, port[, rdns[, username[, password]]]]) 111 | Sets a default proxy which all further socksocket objects will use, 112 | unless explicitly changed. 113 | """ 114 | global _defaultproxy 115 | _defaultproxy = (proxytype,addr,port,rdns,username,password) 116 | 117 | class socksocket(socket.socket): 118 | """socksocket([family[, type[, proto]]]) -> socket object 119 | 120 | Open a SOCKS enabled socket. The parameters are the same as 121 | those of the standard socket init. In order for SOCKS to work, 122 | you must specify family=AF_INET, type=SOCK_STREAM and proto=0. 123 | """ 124 | 125 | def __init__(self, family=socket.AF_INET, type=socket.SOCK_STREAM, proto=0, _sock=None): 126 | _orgsocket.__init__(self,family,type,proto,_sock) 127 | if _defaultproxy != None: 128 | self.__proxy = _defaultproxy 129 | else: 130 | self.__proxy = (None, None, None, None, None, None) 131 | self.__proxysockname = None 132 | self.__proxypeername = None 133 | 134 | def __recvall(self, bytes): 135 | """__recvall(bytes) -> data 136 | Receive EXACTLY the number of bytes requested from the socket. 137 | Blocks until the required number of bytes have been received. 
138 | """ 139 | data = "" 140 | while len(data) < bytes: 141 | data = data + self.recv(bytes-len(data)) 142 | return data 143 | 144 | def setproxy(self,proxytype=None,addr=None,port=None,rdns=True,username=None,password=None): 145 | """setproxy(proxytype, addr[, port[, rdns[, username[, password]]]]) 146 | Sets the proxy to be used. 147 | proxytype - The type of the proxy to be used. Three types 148 | are supported: PROXY_TYPE_SOCKS4 (including socks4a), 149 | PROXY_TYPE_SOCKS5 and PROXY_TYPE_HTTP 150 | addr - The address of the server (IP or DNS). 151 | port - The port of the server. Defaults to 1080 for SOCKS 152 | servers and 8080 for HTTP proxy servers. 153 | rdns - Should DNS queries be preformed on the remote side 154 | (rather than the local side). The default is True. 155 | Note: This has no effect with SOCKS4 servers. 156 | username - Username to authenticate with to the server. 157 | The default is no authentication. 158 | password - Password to authenticate with to the server. 159 | Only relevant when username is also provided. 160 | """ 161 | self.__proxy = (proxytype,addr,port,rdns,username,password) 162 | 163 | def __negotiatesocks5(self,destaddr,destport): 164 | """__negotiatesocks5(self,destaddr,destport) 165 | Negotiates a connection through a SOCKS5 server. 166 | """ 167 | # First we'll send the authentication packages we support. 168 | if (self.__proxy[4]!=None) and (self.__proxy[5]!=None): 169 | # The username/password details were supplied to the 170 | # setproxy method so we support the USERNAME/PASSWORD 171 | # authentication (in addition to the standard none). 172 | self.sendall("\x05\x02\x00\x02") 173 | else: 174 | # No username/password were entered, therefore we 175 | # only support connections with no authentication. 176 | self.sendall("\x05\x01\x00") 177 | # We'll receive the server's response to determine which 178 | # method was selected 179 | chosenauth = self.__recvall(2) 180 | if chosenauth[0] != "\x05": 181 | self.close() 182 | raise GeneralProxyError((1,_generalerrors[1])) 183 | # Check the chosen authentication method 184 | if chosenauth[1] == "\x00": 185 | # No authentication is required 186 | pass 187 | elif chosenauth[1] == "\x02": 188 | # Okay, we need to perform a basic username/password 189 | # authentication. 190 | self.sendall("\x01" + chr(len(self.__proxy[4])) + self.__proxy[4] + chr(len(self.proxy[5])) + self.__proxy[5]) 191 | authstat = self.__recvall(2) 192 | if authstat[0] != "\x01": 193 | # Bad response 194 | self.close() 195 | raise GeneralProxyError((1,_generalerrors[1])) 196 | if authstat[1] != "\x00": 197 | # Authentication failed 198 | self.close() 199 | raise Socks5AuthError,((3,_socks5autherrors[3])) 200 | # Authentication succeeded 201 | else: 202 | # Reaching here is always bad 203 | self.close() 204 | if chosenauth[1] == "\xFF": 205 | raise Socks5AuthError((2,_socks5autherrors[2])) 206 | else: 207 | raise GeneralProxyError((1,_generalerrors[1])) 208 | # Now we can request the actual connection 209 | req = "\x05\x01\x00" 210 | # If the given destination address is an IP address, we'll 211 | # use the IPv4 address request even if remote resolving was specified. 212 | try: 213 | ipaddr = socket.inet_aton(destaddr) 214 | req = req + "\x01" + ipaddr 215 | except socket.error: 216 | # Well it's not an IP number, so it's probably a DNS name. 
217 | if self.__proxy[3]==True: 218 | # Resolve remotely 219 | ipaddr = None 220 | req = req + "\x03" + chr(len(destaddr)) + destaddr 221 | else: 222 | # Resolve locally 223 | ipaddr = socket.inet_aton(socket.gethostbyname(destaddr)) 224 | req = req + "\x01" + ipaddr 225 | req = req + struct.pack(">H",destport) 226 | self.sendall(req) 227 | # Get the response 228 | resp = self.__recvall(4) 229 | if resp[0] != "\x05": 230 | self.close() 231 | raise GeneralProxyError((1,_generalerrors[1])) 232 | elif resp[1] != "\x00": 233 | # Connection failed 234 | self.close() 235 | if ord(resp[1])<=8: 236 | raise Socks5Error(ord(resp[1]),_generalerrors[ord(resp[1])]) 237 | else: 238 | raise Socks5Error(9,_generalerrors[9]) 239 | # Get the bound address/port 240 | elif resp[3] == "\x01": 241 | boundaddr = self.__recvall(4) 242 | elif resp[3] == "\x03": 243 | resp = resp + self.recv(1) 244 | boundaddr = self.__recvall(resp[4]) 245 | else: 246 | self.close() 247 | raise GeneralProxyError((1,_generalerrors[1])) 248 | boundport = struct.unpack(">H",self.__recvall(2))[0] 249 | self.__proxysockname = (boundaddr,boundport) 250 | if ipaddr != None: 251 | self.__proxypeername = (socket.inet_ntoa(ipaddr),destport) 252 | else: 253 | self.__proxypeername = (destaddr,destport) 254 | 255 | def getproxysockname(self): 256 | """getsockname() -> address info 257 | Returns the bound IP address and port number at the proxy. 258 | """ 259 | return self.__proxysockname 260 | 261 | def getproxypeername(self): 262 | """getproxypeername() -> address info 263 | Returns the IP and port number of the proxy. 264 | """ 265 | return _orgsocket.getpeername(self) 266 | 267 | def getpeername(self): 268 | """getpeername() -> address info 269 | Returns the IP address and port number of the destination 270 | machine (note: getproxypeername returns the proxy) 271 | """ 272 | return self.__proxypeername 273 | 274 | def __negotiatesocks4(self,destaddr,destport): 275 | """__negotiatesocks4(self,destaddr,destport) 276 | Negotiates a connection through a SOCKS4 server. 277 | """ 278 | # Check if the destination address provided is an IP address 279 | rmtrslv = False 280 | try: 281 | ipaddr = socket.inet_aton(destaddr) 282 | except socket.error: 283 | # It's a DNS name. Check where it should be resolved. 284 | if self.__proxy[3]==True: 285 | ipaddr = "\x00\x00\x00\x01" 286 | rmtrslv = True 287 | else: 288 | ipaddr = socket.inet_aton(socket.gethostbyname(destaddr)) 289 | # Construct the request packet 290 | req = "\x04\x01" + struct.pack(">H",destport) + ipaddr 291 | # The username parameter is considered userid for SOCKS4 292 | if self.__proxy[4] != None: 293 | req = req + self.__proxy[4] 294 | req = req + "\x00" 295 | # DNS name if remote resolving is required 296 | # NOTE: This is actually an extension to the SOCKS4 protocol 297 | # called SOCKS4A and may not be supported in all cases. 
298 | if rmtrslv==True: 299 | req = req + destaddr + "\x00" 300 | self.sendall(req) 301 | # Get the response from the server 302 | resp = self.__recvall(8) 303 | if resp[0] != "\x00": 304 | # Bad data 305 | self.close() 306 | raise GeneralProxyError((1,_generalerrors[1])) 307 | if resp[1] != "\x5A": 308 | # Server returned an error 309 | self.close() 310 | if ord(resp[1]) in (91,92,93): 311 | self.close() 312 | raise Socks4Error((ord(resp[1]),_socks4errors[ord(resp[1])-90])) 313 | else: 314 | raise Socks4Error((94,_socks4errors[4])) 315 | # Get the bound address/port 316 | self.__proxysockname = (socket.inet_ntoa(resp[4:]),struct.unpack(">H",resp[2:4])[0]) 317 | if rmtrslv != None: 318 | self.__proxypeername = (socket.inet_ntoa(ipaddr),destport) 319 | else: 320 | self.__proxypeername = (destaddr,destport) 321 | 322 | def __negotiatehttp(self,destaddr,destport): 323 | """__negotiatehttp(self,destaddr,destport) 324 | Negotiates a connection through an HTTP server. 325 | """ 326 | # If we need to resolve locally, we do this now 327 | if self.__proxy[3] == False: 328 | addr = socket.gethostbyname(destaddr) 329 | else: 330 | addr = destaddr 331 | self.sendall("CONNECT " + addr + ":" + str(destport) + " HTTP/1.1\r\n" + "Host: " + destaddr + "\r\n\r\n") 332 | # We read the response until we get the string "\r\n\r\n" 333 | resp = self.recv(1) 334 | while resp.find("\r\n\r\n")==-1: 335 | resp = resp + self.recv(1) 336 | # We just need the first line to check if the connection 337 | # was successful 338 | statusline = resp.splitlines()[0].split(" ",2) 339 | if statusline[0] not in ("HTTP/1.0","HTTP/1.1"): 340 | self.close() 341 | raise GeneralProxyError((1,_generalerrors[1])) 342 | try: 343 | statuscode = int(statusline[1]) 344 | except ValueError: 345 | self.close() 346 | raise GeneralProxyError((1,_generalerrors[1])) 347 | if statuscode != 200: 348 | self.close() 349 | raise HTTPError((statuscode,statusline[2])) 350 | self.__proxysockname = ("0.0.0.0",0) 351 | self.__proxypeername = (addr,destport) 352 | 353 | def connect(self,destpair): 354 | """connect(self,despair) 355 | Connects to the specified destination through a proxy. 356 | destpar - A tuple of the IP/DNS address and the port number. 357 | (identical to socket's connect). 358 | To select the proxy server use setproxy(). 
359 | """ 360 | # Do a minimal input check first 361 | if (type(destpair) in (list,tuple)==False) or (len(destpair)<2) or (type(destpair[0])!=str) or (type(destpair[1])!=int): 362 | raise GeneralProxyError((5,_generalerrors[5])) 363 | if self.__proxy[0] == PROXY_TYPE_SOCKS5: 364 | if self.__proxy[2] != None: 365 | portnum = self.__proxy[2] 366 | else: 367 | portnum = 1080 368 | _orgsocket.connect(self,(self.__proxy[1],portnum)) 369 | self.__negotiatesocks5(destpair[0],destpair[1]) 370 | elif self.__proxy[0] == PROXY_TYPE_SOCKS4: 371 | if self.__proxy[2] != None: 372 | portnum = self.__proxy[2] 373 | else: 374 | portnum = 1080 375 | _orgsocket.connect(self,(self.__proxy[1],portnum)) 376 | self.__negotiatesocks4(destpair[0],destpair[1]) 377 | elif self.__proxy[0] == PROXY_TYPE_HTTP: 378 | if self.__proxy[2] != None: 379 | portnum = self.__proxy[2] 380 | else: 381 | portnum = 8080 382 | _orgsocket.connect(self,(self.__proxy[1],portnum)) 383 | self.__negotiatehttp(destpair[0],destpair[1]) 384 | elif self.__proxy[0] == None: 385 | _orgsocket.connect(self,(destpair[0],destpair[1])) 386 | else: 387 | raise GeneralProxyError((4,_generalerrors[4])) 388 | --------------------------------------------------------------------------------