├── crawler
│   ├── __init__.py
│   ├── commands
│   │   ├── __init__.py
│   │   └── crawlall.py
│   ├── contrib
│   │   ├── __init__.py
│   │   └── socks.py
│   ├── middleware
│   │   ├── __init__.py
│   │   ├── proxy.py
│   │   └── random_user_agent.py
│   ├── proxy.txt
│   ├── spiders
│   │   ├── __init__.py
│   │   ├── cnproxy.py
│   │   ├── socksproxy.py
│   │   ├── 66ip.py
│   │   ├── xici.py
│   │   ├── haodaili.py
│   │   ├── proxylists.py
│   │   ├── xroxy.py
│   │   ├── chunzhen.py
│   │   ├── kuaidaili.py
│   │   └── qiaodm.py
│   ├── items.py
│   ├── settings.py
│   └── pipelines.py
├── .gitignore
├── scrapy.cfg
├── LICENSE.txt
└── README.md
/crawler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /crawler/commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /crawler/contrib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /crawler/middleware/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | 4 | -------------------------------------------------------------------------------- /crawler/proxy.txt: -------------------------------------------------------------------------------- 1 | anonymous://121.8.98.202:8080 2 | anonymous://121.8.98.201:8080 3 | socks5://27.152.181.217:80 4 | -------------------------------------------------------------------------------- /crawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = crawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = crawler 12 | -------------------------------------------------------------------------------- /crawler/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class ProxyIPItem(scrapy.Item): 11 | ip = scrapy.Field() 12 | port = scrapy.Field() 13 | type = scrapy.Field() 14 | speed = scrapy.Field() 15 | post = scrapy.Field() 16 | ssl = scrapy.Field() 17 | 18 | -------------------------------------------------------------------------------- /crawler/middleware/proxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | from scrapy.conf import settings 4 | import logging 5 | 6 | logger = logging.getLogger("crawler.middleware.proxy") 7 | 8 | class StaticProxyMiddleware(object): 9 | # overwrite process request 10 | def process_request(self, request, spider): 11 | # Set the location of the proxy 12 | proxy = settings.get("PROXY") 13 | logger.info("process request %s using proxy %s" % (request, proxy)) 14 | request.meta['proxy'] = proxy 15 | 16 | class RandomProxyMiddleware(object): 17 | 18 | def process_request(self, request, spider): 19 | proxy = random.choice(settings.get('PROXY_LIST')) 20 | logger.info("process request %s using proxy %s" % (request, proxy)) 21 | request.meta['proxy'] = proxy 22 | -------------------------------------------------------------------------------- /crawler/spiders/cnproxy.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import Spider 2 | from scrapy.http import Request 3 | from scrapy.selector import Selector 4 | from crawler.items import ProxyIPItem 5 | 6 | class CnProxySpider(Spider): 7 | name = "cnproxy" 8 | allowed_domains = ["cn-proxy.com"] 9 | start_urls = [ 10 | "http://cn-proxy.com/", 11 | "http://cn-proxy.com/archives/218" 12 | ] 13 | referer = "http://cn-proxy.com/" 14 | 15 | def start_requests(self): 16 | for item in self.start_urls: 17 | yield Request(url=item, headers={'Referer': self.referer}) 18 | 19 | def parse(self, response): 20 | ip_list = response.xpath('//table[@class="sortable"]/tbody/tr') 21 | for ip in ip_list: 22 | item = ProxyIPItem() 23 | item['ip'] = ip.xpath('td[1]/text()').extract()[0] 24 | item['port'] = ip.xpath('td[2]/text()').extract()[0] 25 | item['type'] = 'http' 26 | yield item 27 | -------------------------------------------------------------------------------- /crawler/spiders/socksproxy.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import Spider 2 | from scrapy.http import Request 3 | from scrapy.selector import Selector 4 | from crawler.items import ProxyIPItem 5 | 6 | class SocksProxySpider(Spider): 7 | name = "socksproxy" 8 | allowed_domains = ["socks-proxy.net"] 9 | start_urls = [ 10 | "http://www.socks-proxy.net" 11 | ] 12 
| referer = "http://www.socks-proxy.net" 13 | 14 | def start_requests(self): 15 | for item in self.start_urls: 16 | yield Request(url=item, headers={'Referer': self.referer}) 17 | 18 | def parse(self, response): 19 | ip_list = response.xpath('//*[@id="proxylisttable"]/tbody/tr') 20 | for ip in ip_list: 21 | item = ProxyIPItem() 22 | item['ip'] = ip.xpath('td[1]/text()').extract()[0] 23 | item['port'] = ip.xpath('td[2]/text()').extract()[0] 24 | item['type'] = ip.xpath('td[5]/text()').extract()[0].lower() 25 | yield item 26 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 xelzmm@gmail.com 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | -------------------------------------------------------------------------------- /crawler/spiders/66ip.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import Spider 2 | from scrapy.http import Request 3 | from scrapy.selector import Selector 4 | from crawler.items import ProxyIPItem 5 | import random 6 | import re 7 | 8 | from urllib import unquote 9 | from re import search 10 | 11 | class _66IPSpider(Spider): 12 | name = "66ip" 13 | allowed_domains = ["66ip.cn"] 14 | start_urls = [ 15 | "http://www.66ip.cn/mo.php?sxb=&tqsl=%s&port=&export=&ktip=&sxa=&submit=%%CC%%E1++%%C8%%A1&textarea=" % random.randint(3000, 5000), 16 | "http://www.66ip.cn/nmtq.php?getnum=%s&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=0&proxytype=2&api=71daili" % random.randint(3000, 5000) 17 | ] 18 | 19 | def start_requests(self): 20 | for item in self.start_urls: 21 | yield Request(url=item, headers={'Referer': item[:item.index('?')]}) 22 | 23 | def parse(self, response): 24 | ip_list = re.findall("\d+\.\d+\.\d+\.\d+:\d+", response.body) 25 | for ip in ip_list: 26 | item = ProxyIPItem() 27 | item['ip'] = ip[:ip.index(':')] 28 | item['port'] = ip[ip.index(":") + 1:] 29 | item['type'] = 'http' 30 | yield item 31 | 32 | -------------------------------------------------------------------------------- /crawler/spiders/xici.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import Spider 2 | from scrapy.http import Request 3 | from scrapy.selector import Selector 4 | from crawler.items import ProxyIPItem 5 | 6 | class XiciSpider(Spider): 7 | name = "xici" 8 | allowed_domains = ["xicidaili.com"] 9 | start_urls = [ 10 | "http://www.xicidaili.com/nn", 11 | "http://www.xicidaili.com/nn/2", 12 | "http://www.xicidaili.com/nn/3", 13 | "http://www.xicidaili.com/nn/4", 14 | "http://www.xicidaili.com/nn/5", 15 | "http://www.xicidaili.com/nn/6", 16 | "http://www.xicidaili.com/nn/7", 17 | "http://www.xicidaili.com/nn/8", 18 | "http://www.xicidaili.com/nn/9", 19 | "http://www.xicidaili.com/nn/10" 20 | ] 21 | referer = 'http://www.xicidaili.com/nn' 22 | 23 | def start_requests(self): 24 | for item in self.start_urls: 25 | yield Request(url=item, headers={'Referer': self.referer}) 26 | 27 | def parse(self, response): 28 | ip_list = response.xpath('//table[@id="ip_list"]/tr') 29 | if len(ip_list) > 0: 30 | ip_list.pop(0) 31 | for ip in ip_list: 32 | item = ProxyIPItem() 33 | item['ip'] = ip.xpath('td[2]/text()').extract()[0] 34 | item['port'] = ip.xpath('td[3]/text()').extract()[0] 35 | item['type'] = 'http' 36 | yield item 37 | -------------------------------------------------------------------------------- /crawler/commands/crawlall.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | from scrapy.exceptions import UsageError 3 | from scrapy.utils.conf import arglist_to_dict 4 | class Command(ScrapyCommand): 5 | requires_project = True 6 | def syntax(self): 7 | return '[options]' 8 | def short_desc(self): 9 | return 'Runs all of the spiders' 10 | def add_options(self, parser): 11 | ScrapyCommand.add_options(self, parser) 12 | parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE", 13 | help="set spider argument (may be repeated)") 14 | parser.add_option("-o", "--output", metavar="FILE", 15 | help="dump scraped items into FILE (use - for stdout)") 16 | parser.add_option("-t", 
"--output-format", metavar="FORMAT", 17 | help="format to use for dumping items with -o") 18 | def process_options(self, args, opts): 19 | ScrapyCommand.process_options(self, args, opts) 20 | try: 21 | opts.spargs = arglist_to_dict(opts.spargs) 22 | except ValueError: 23 | raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False) 24 | def run(self, args, opts): 25 | #settings = get_project_settings() 26 | 27 | spider_loader = self.crawler_process.spider_loader 28 | for spidername in args or spider_loader.list(): 29 | print "*********cralall spidername************" + spidername 30 | self.crawler_process.crawl(spidername, **opts.spargs) 31 | self.crawler_process.start() 32 | -------------------------------------------------------------------------------- /crawler/spiders/haodaili.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy.spiders import Spider 4 | from scrapy.http import Request 5 | from scrapy.selector import Selector 6 | from crawler.items import ProxyIPItem 7 | 8 | class HaodailiSpider(Spider): 9 | name = "haodaili" 10 | allowed_domains = ["haodailiip.com"] 11 | start_urls = [ 12 | "http://www.haodailiip.com/guonei", 13 | "http://www.haodailiip.com/guoji" 14 | ] 15 | referer = "http://www.haodailiip.com" 16 | 17 | def start_requests(self): 18 | for item in self.start_urls: 19 | yield Request(url=item, headers={'Referer': self.referer}) 20 | 21 | def parse(self, response): 22 | ip_list = response.xpath('/html/body/center/table[2]/tr/td[1]/table/tr') 23 | if len(ip_list) > 1: 24 | ip_list.pop(0) 25 | has_next = True 26 | for ip in ip_list: 27 | item = ProxyIPItem() 28 | columns = ip.xpath('td/text()').extract() 29 | item['ip'] = columns[0].strip() 30 | item['port'] = columns[1].strip() 31 | item['type'] = 'http' 32 | if columns[-1].strip() == u'超时': 33 | has_next = False 34 | yield item 35 | if has_next: 36 | url = "%s%s" % (self.referer, response.xpath('/html/body/center/table[2]/tr/td[1]/p/a[last()]/@href').extract()[0]) 37 | yield Request(url=url, headers={'Referer': response.url}) 38 | 39 | -------------------------------------------------------------------------------- /crawler/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for crawler project 4 | # 5 | # For simplicity, this file contains only the most important settings by 6 | # default. 
All the other settings are documented here: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # 10 | 11 | BOT_NAME = 'crawler' 12 | 13 | SPIDER_MODULES = ['crawler.spiders'] 14 | NEWSPIDER_MODULE = 'crawler.spiders' 15 | COMMANDS_MODULE = 'crawler.commands' 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | #USER_AGENT = 'crawler (+http://www.yourdomain.com)' 18 | 19 | DOWNLOADER_MIDDLEWARES = { 20 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware' : None, 21 | 'scrapy.downloadermiddlewares.retry.RetryMiddleware' : 300, 22 | 'crawler.middleware.random_user_agent.RandomUserAgentMiddleware' : 500, 23 | # 'crawler.middleware.proxy.StaticProxyMiddleware' : 100 , 24 | } 25 | 26 | ITEM_PIPELINES = { 27 | 'crawler.pipelines.ProxyScanPipeline': 500, 28 | # 'crawler.pipelines.PrintPipeline': 800, 29 | } 30 | 31 | LOG_LEVEL = 'INFO' 32 | 33 | DOWNLOAD_DELAY = 1 34 | import urllib2 35 | LOCAL_IP = urllib2.urlopen('http://ifconfig.io/ip').read()[:-1] 36 | # PROXY = "http://61.53.143.179:80" 37 | 38 | # used by StaticProxyMiddleware, if you want to crawl through a proxy server 39 | PROXY = "http://localhost:8088" 40 | 41 | # used by RandomProxyMiddleware, if you want to crawl through proxy servers 42 | PROXY_LIST = [ 43 | "http://120.83.5.164:18000", 44 | "http://111.161.126.100:80", 45 | "http://61.53.143.179:80" 46 | ] 47 | 48 | RETRY_HTTP_CODES = [500, 502, 503, 504, 400, 405, 408] 49 | -------------------------------------------------------------------------------- /crawler/spiders/proxylists.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import Spider 2 | from scrapy.http import Request 3 | from scrapy.selector import Selector 4 | from crawler.items import ProxyIPItem 5 | 6 | from urllib import unquote 7 | from re import search 8 | 9 | class ProxylistsSpider(Spider): 10 | name = "proxylists" 11 | allowed_domains = ["proxylists.net"] 12 | start_urls = [ 13 | "http://www.proxylists.net/cn_1_ext.html", 14 | ] 15 | referer = "http://www.proxylists.net/" 16 | 17 | def start_requests(self): 18 | for item in self.start_urls: 19 | yield Request(url=item, headers={'Referer': self.referer}) 20 | 21 | def parse(self, response): 22 | ip_list = response.xpath('body/font/b/table/tr[1]/td[2]/table/tr') 23 | if len(ip_list) > 3: 24 | ip_list.pop(1) 25 | ip_list.pop(0) 26 | ip = ip_list.pop() 27 | cur_page = int(search('cn_(\d+)_ext', response.url).group(1)) 28 | total = len(ip.xpath('td/b/a')) 29 | if total - cur_page > 1: 30 | yield Request(url="http://www.proxylists.net/cn_%d_ext.html" % (cur_page + 1), headers={'Referer': response.url}) 31 | for ip in ip_list: 32 | item = ProxyIPItem() 33 | item['ip'] = unquote(search('%22(.*)%22', ip.xpath('td/script/text()').extract()[0]).group(1)) 34 | item['port'] = ip.xpath('td[2]/text()').extract()[0] 35 | type = ip.xpath('td[3]/text()').extract()[0].lower() 36 | type = 'http' if type in ['anonymous', 'transparent', 'high anonymity', 'distorting'] else type 37 | item['type'] = type 38 | yield item 39 | 40 | -------------------------------------------------------------------------------- /crawler/spiders/xroxy.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import Spider 2 | from scrapy.http import Request 3 | from scrapy.selector import Selector 4 | from crawler.items import ProxyIPItem 5 | 6 | from re import search 7 | 8 | class XroxySpider(Spider): 9 | name = "xroxy" 10 | 
allowed_domains = ["xroxy.com"] 11 | start_urls = [ 12 | "http://www.xroxy.com/proxylist.php", 13 | ] 14 | referer = "http://www.xroxy.com/" 15 | 16 | def start_requests(self): 17 | for item in self.start_urls: 18 | yield Request(url=item, headers={'Referer': self.referer}) 19 | 20 | def parse(self, response): 21 | total = int(response.xpath('//*[@id="content"]/table[2]/tr/td[1]/table/tr[2]/td/small/b/text()').extract()[0]) / 10 22 | if response.url.find('pnum=') == -1: 23 | cur_page = 0 24 | else: 25 | cur_page = int(search('pnum=(\d+)', response.url).group(1)) 26 | if total - cur_page > 1: 27 | yield Request(url="http://www.xroxy.com/proxylist.php?pnum=%d" % (cur_page + 1), headers={'Referer': response.url}) 28 | ip_list = response.xpath('//*[@id="content"]/table[1]/tr[@class="row0"] | //*[@id="content"]/table[1]/tr[@class="row1"]') 29 | for ip in ip_list: 30 | item = ProxyIPItem() 31 | item['ip'] = ip.xpath('td[2]/a/text()').extract()[0].strip() 32 | item['port'] = ip.xpath('td[3]/a/text()').extract()[0].strip() 33 | type = ip.xpath('td[4]/a/text()').extract()[0].strip().lower() 34 | type = 'http' if type in ['anonymous', 'transparent', 'high anonymity', 'distorting'] else type 35 | item['type'] = type 36 | yield item 37 | 38 | -------------------------------------------------------------------------------- /crawler/spiders/chunzhen.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import Spider 2 | from scrapy.http import Request 3 | from scrapy.selector import Selector 4 | from crawler.items import ProxyIPItem 5 | 6 | class ChunzhenSpider(Spider): 7 | name = "chunzhen" 8 | allowed_domains = ["cz88.net"] 9 | start_urls = [ 10 | "http://www.cz88.net/proxy/index.shtml", 11 | "http://www.cz88.net/proxy/http_2.shtml", 12 | "http://www.cz88.net/proxy/http_3.shtml", 13 | "http://www.cz88.net/proxy/http_4.shtml", 14 | "http://www.cz88.net/proxy/http_5.shtml", 15 | "http://www.cz88.net/proxy/http_6.shtml", 16 | "http://www.cz88.net/proxy/http_7.shtml", 17 | "http://www.cz88.net/proxy/http_8.shtml", 18 | "http://www.cz88.net/proxy/http_9.shtml", 19 | "http://www.cz88.net/proxy/http_10.shtml", 20 | "http://www.cz88.net/proxy/socks4.shtml", 21 | "http://www.cz88.net/proxy/socks4_2.shtml", 22 | "http://www.cz88.net/proxy/socks4_3.shtml", 23 | "http://www.cz88.net/proxy/socks5.shtml", 24 | "http://www.cz88.net/proxy/socks5_2.shtml" 25 | ] 26 | referer = 'http://www.cz88.net/proxy/index.shtml' 27 | 28 | def start_requests(self): 29 | for item in self.start_urls: 30 | yield Request(url=item, headers={'Referer': self.referer}) 31 | 32 | def parse(self, response): 33 | ip_list = response.xpath('//div[@id="boxright"]/div/ul/li') 34 | if len(ip_list) > 0: 35 | ip_list.pop(0) 36 | for ip in ip_list: 37 | item = ProxyIPItem() 38 | item['ip'] = ip.xpath('div[@class="ip"]/text()').extract()[0] 39 | item['port'] = ip.xpath('div[@class="port"]/text()').extract()[0] 40 | if response.url.find('socks4') != -1: 41 | item['type'] = 'socks4' 42 | elif response.url.find('socks5') != -1: 43 | item['type'] = 'socks5' 44 | else: 45 | item['type'] = 'http' 46 | yield item 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ##Introduction 2 | 3 | **Proxy Server Crawler** is a tool used to crawl public proxy servers from proxy websites. 
Once a proxy server (ip::port::type) has been crawled, the tool tests the functionality of that server automatically. 4 | 5 | Currently supported websites: 6 | 7 | * http://www.66ip.cn 8 | * http://www.cz88.net 9 | * http://www.cn-proxy.com 10 | * http://www.haodailiip.com 11 | * http://www.kuaidaili.com 12 | * http://www.proxylists.net 13 | * http://www.qiaodm.net 14 | * http://www.socks-proxy.net 15 | * http://www.xroxy.com 16 | * http://www.xicidaili.com 17 | 18 | Currently supported tests (for http proxies): 19 | 20 | * ssl support 21 | * post support 22 | * speed (tested with 10 frequently used sites) 23 | * type (high/anonymous/transparent) 24 | 25 | ## Requirements 26 | 27 | * Python >= 2.7 28 | * Scrapy 1.3.0 (not tested with lower versions) 29 | * node (some sites require node to bypass a JavaScript-based WAF) 30 | 31 | ## Usage 32 | 33 | ```bash 34 | cd proxy_server_crawler 35 | scrapy crawl chunzhen 36 | ``` 37 | 38 | You can also run every spider at once with the custom `crawlall` command (`scrapy crawlall`, see `crawler/commands/crawlall.py`). Sample log output: 39 | 40 | ``` 41 | [ result] ip: 59.41.214.218 , port: 3128 , type: http, proxy server not alive or healthy. 42 | [ result] ip: 117.90.6.67 , port: 9000 , type: http, proxy server not alive or healthy. 43 | [ result] ip: 117.175.183.10 , port: 8123 , speed: 984 , type: high 44 | [ result] ip: 180.95.154.221 , port: 80 , type: http, proxy server not alive or healthy. 45 | [ result] ip: 110.73.0.206 , port: 8123 , type: http, proxy server not alive or healthy. 46 | [ proxy] ip: 124.88.67.54 , port: 80 , speed: 448 , type: high , post: True , ssl: False 47 | [ result] ip: 117.90.2.149 , port: 9000 , type: http, proxy server not alive or healthy. 48 | [ result] ip: 115.212.165.170, port: 9000 , type: http, proxy server not alive or healthy. 49 | [ proxy] ip: 118.123.22.192 , port: 3128 , speed: 769 , type: high , post: True , ssl: False 50 | [ proxy] ip: 117.175.183.10 , port: 8123 , speed: 908 , type: high , post: True , ssl: True 51 | ``` 52 | 53 | ## License 54 | 55 | The MIT License (MIT) 56 | -------------------------------------------------------------------------------- /crawler/spiders/kuaidaili.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import Spider 2 | from scrapy.http import Request 3 | from scrapy.selector import Selector 4 | from crawler.items import ProxyIPItem 5 | import re 6 | import os 7 | 8 | class KuaidailiSpider(Spider): 9 | name = "kuaidaili" 10 | allowed_domains = ["kuaidaili.com"] 11 | start_urls = [ 12 | "http://www.kuaidaili.com" 13 | ] 14 | referer = 'http://www.kuaidaili.com' 15 | 16 | def start_requests(self): 17 | for item in self.start_urls: 18 | yield Request(url=item, headers={'Referer': self.referer}, meta={'handle_httpstatus_list': [521]}, callback=self.parseInitRequest) 19 | 20 | def parse(self, response): 21 | ip_list = response.xpath('//div[@id="index_free_list"]/table/tbody/tr') 22 | for line in ip_list: 23 | item = ProxyIPItem(type="http") 24 | item["ip"] = line.xpath('td[1]/text()').extract()[0].strip() 25 | item["port"] = line.xpath('td[2]/text()').extract()[0].strip() 26 | yield item 27 | if response.request.url.find('proxylist') < 0: 28 | pages = response.xpath('//div[@id="listnav"]/ul/li/a') 29 | pages.pop(0) 30 | for page in pages: 31 | path = page.xpath('@href').extract()[0] 32 | yield Request(url=self.start_urls[0] + path, headers={'Referer': response.request.url, 'User-Agent': response.request.headers.get('User-Agent')}) 33 | 34 | 35 | def parseInitRequest(self, response): 36 | if response.status == 200: 37 | yield Request(url=response.request.url, 
headers={'Referer': self.referer, 'User-Agent': response.request.headers.get('User-Agent')}, dont_filter=True) 38 | return 39 | group = re.search('setTimeout\("\w+\((\d+)\)".*(function .*"\);})', response.body) 40 | key, function = group.group(1), group.group(2) 41 | script = '!%s(%s)' % (function.replace('=eval', '=console.log'), key) 42 | result = os.popen("node -e '%s'" % script).read() 43 | path = re.search('"(.*)"', result).group(1) 44 | url = response.request.url + path 45 | yield Request(url=url, headers={'Referer': response.request.url, 'User-Agent': response.request.headers.get('User-Agent')}, dont_filter=True) 46 | -------------------------------------------------------------------------------- /crawler/spiders/qiaodm.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import Spider 2 | from scrapy.http import Request 3 | from scrapy.selector import Selector 4 | from crawler.items import ProxyIPItem 5 | 6 | class QiaodmSpider(Spider): 7 | name = "qiaodm" 8 | allowed_domains = ["qiaodm.com"] 9 | start_urls = [ 10 | "http://ip.qiaodm.com/", 11 | "http://ip.qiaodm.com/free/index.html" 12 | ] 13 | referer = "http://ip.qiaodm.com/" 14 | 15 | def start_requests(self): 16 | for item in self.start_urls: 17 | yield Request(url=item, headers={'Referer': self.referer}, dont_filter=True) 18 | 19 | def parse(self, response): 20 | # pages = response.xpath('//*[@id="flip"]/div/span | //*[@id="flip"]/div/a') 21 | # if len(pages) > 4: 22 | # next_page = pages[-2].xpath('@href').extract() 23 | # if len(next_page) == 1: 24 | # yield Request(url='%s/%s' % (self.referer, next_page[0]), headers={'Referer': response.url}) 25 | if response.request.url == 'http://ip.qiaodm.com/free/index.html': 26 | hot_urls = response.xpath('//div[@class="freeb"]/a[contains(@href,"free")]/@href').extract() 27 | for url in hot_urls: 28 | yield Request(url=url, headers={'Referer': self.referer}) 29 | country_urls = response.xpath('//a[@class="item"]/@href').extract() 30 | for url in country_urls: 31 | yield Request(url=url, headers={'Referer': self.referer}) 32 | 33 | ip_list = response.xpath('//*[@id="main_container"]/div[1]/table/tbody/tr') 34 | if len(ip_list) > 2: 35 | ip_list.pop(1) 36 | ip_list.pop(0) 37 | for line in ip_list: 38 | item = ProxyIPItem() 39 | columns = line.xpath('td') 40 | ip_spans = columns[0].xpath('node()/script/text() | node()[not(contains(@style, "none"))]/text()').extract() 41 | item['ip'] = ''.join([a.replace('document.write(\'','').replace('\');','') for a in ip_spans]) 42 | # port = columns[1].xpath('text()').extract()[0] 43 | port = columns[1].xpath('@class').extract()[0].split(' ')[1] 44 | port = int(''.join([str("ABCDEFGHIZ".index(c)) for c in port])) / 8 45 | item['port'] = port 46 | # port = columns[1].xpath('script/text()').extract()[0] 47 | # port = port[port.index('=') + 1:port.index(';')] 48 | # item['port'] = ''.join([str(eval(a)) for a in port.split('+')]) 49 | item['type'] = 'http' 50 | yield item 51 | -------------------------------------------------------------------------------- /crawler/middleware/random_user_agent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware 4 | import logging 5 | 6 | class RandomUserAgentMiddleware(UserAgentMiddleware): 7 | 8 | def __init__(self, user_agent=''): 9 | self.logger = logging.getLogger("crawler.middleware.randomua") 10 | 
self.user_agent = user_agent 11 | 12 | def process_request(self, request, spider): 13 | ua = random.choice(self.user_agent_list) 14 | if request.headers.get('User-Agent') is not None: 15 | return 16 | request.headers.setdefault('User-Agent', ua) 17 | self.logger.info("process request %s using random ua: %s" % (request, ua)) 18 | 19 | #the default user_agent_list composes chrome,I E,firefox,Mozilla,opera,netscape 20 | #for more user agent strings,you can find it in http://www.useragentstring.com/pages/useragentstring.php 21 | user_agent_list = [ 22 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", 23 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET4.0C)", 24 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; qihu theworld)", 25 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Trident/4.0)", 26 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Trident/4.0; 360SE)", 27 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Trident/4.0; InfoPath.2)", 28 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.30729; LBBROWSER)", 29 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1)", 30 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; GreenBrowser)", 31 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.2; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)", 32 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; 2345Explorer 4.2.0.13850)", 33 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; .NET CLR 3.5.30729; Alexa Toolbar; .NET CLR 1.1.4322)", 34 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30618)", 35 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.4; KB974488)", 36 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; iCafeMedia; .NET CLR 2.0.50727; .NET4.0C; .NET4.0E)", 37 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; BIDUBrowser 2.x)", 38 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; baiduie8; 2345Explorer 4.2.0.13929)", 39 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; Apache; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729)", 40 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0; 360SE)", 41 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0; 2345Explorer 5.0.0.14067)", 42 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/7.0)", 43 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/7.0; KB974488)", 44 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0) LBBROWSER", 45 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; QQBrowser/8.0.2820.400)", 46 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36", 47 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36 LBBROWSER", 48 | "Mozilla/5.0 (Windows NT 5.1; rv:32.0) Gecko/20100101 Firefox/32.0", 49 | "Mozilla/5.0 (Windows NT 5.1; rv:25.0) Gecko/20100101 Firefox/25.0", 50 | "Mozilla/5.0 (Windows NT 5.1; rv:25.0) Gecko/20100101 Firefox/31.0", 51 | "Mozilla/5.0 (Windows NT 5.1; rv:25.0) Gecko/20100101 Firefox/36.0", 52 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.12 (KHTML, like Gecko) Maxthon/3.0 Chrome/18.0.966.0 
Safari/535.12", 53 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36 CoolNovo/2.0.9.20", 54 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36 LBBROWSER", 55 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 UBrowser/3.1.1644.34 Safari/537.36", 56 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36 OPR/27.0.1689.76", 57 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.2.0.3000 Chrome/30.0.1551.0 Safari/537.36", 58 | "Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0", 59 | "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; MALC; rv:11.0; QQBrowser/8.0.3345.400) like Gecko", 60 | "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0; QQBrowser/8.0.3197.400) like Gecko", 61 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.0.3000 Chrome/30.0.1599.101 Safari/537.36", 62 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3647.11 Safari/537.36", 63 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8 (.NET CLR 3.5.30729)", 64 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0", 65 | "UCWEB/2.0 (Linux; U; Adr 2.3.5; zh-CN; Lenovo A288t) U2/1.0.0 UCBrowser/9.6.2.404 U2/1.0.0 Mobile", 66 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36", 67 | "UCWEB/2.0 (MIDP-2.0; U; zh-CN; Lenovo S898t+) U2/1.0.0 UCBrowser/10.2.1.550 U2/1.0.0 Mobile", 68 | "UCWEB/2.0 (MIDP-2.0; U; zh-CN; MI 4C) U2/1.0.0 UCBrowser/10.2.0.535 U2/1.0.0 Mobile", 69 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36", 70 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36 QQBrowser/3.3.3201.400", 71 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:36.0) Gecko/20100101 Firefox/36.0", 72 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:36.0) Gecko/20100101 Firefox/36.0", 73 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; zh_cn) AppleWebKit/600.1.4.12.4 (KHTML, like Gecko) Version/5.0.5 Safari/600.1.4.12.4", 74 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.3.18 (KHTML, like Gecko) Version/7.1.3 Safari/537.85.12" 75 | ] 76 | -------------------------------------------------------------------------------- /crawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import urllib2 8 | import json 9 | import socket 10 | import exceptions 11 | import httplib 12 | import time 13 | from scrapy.exceptions import DropItem 14 | from crawler.items import * 15 | from threading import Thread 16 | from scrapy.conf import settings 17 | import logging 18 | import Queue 19 | import gzip 20 | from StringIO import StringIO 21 | 22 | # adding threading 23 | import threading 24 | lock = threading.Lock() 25 | fs = open('proxy.txt','w') # adding here for writing http proxy 26 | socket.setdefaulttimeout(2) 27 | localhost = settings.get('LOCAL_IP') 28 | logger = 
logging.getLogger('crawler.proxy.checker') 29 | proxy_headers = [ 30 | 'x-proxy-id', 31 | 'via', 32 | 'x-via', 33 | 'x-forwarded-for', 34 | 'forwarded-for', 35 | 'x-client-ip', 36 | 'client-ip', 37 | 'x-real-ip', 38 | 'real-ip', 39 | 'proxy-client-ip', 40 | 'wl-proxy-client-ip', 41 | 'x-bluecoat-via', 42 | 'x-cc-connectivity', 43 | 'x-mato-param', 44 | 'x-forwarded-host', 45 | 'x-forwarded-server' 46 | ] 47 | 48 | class CrawlerPipeline(object): 49 | def process_item(self, item, spider): 50 | return item 51 | 52 | class PrintPipeline(object): 53 | def process_item(self, item, spider): 54 | if isinstance(item, ProxyIPItem): 55 | logger.info('\033[33m[crawled]\033[m ip: \033[33m%-15s\033[m, port: \033[33m%-5s\033[m, type: \033[33m%s\033[m' % (item['ip'], item['port'], item['type'])) 56 | return item 57 | 58 | class ProxyScanPipeline(object): 59 | 60 | def __init__(self): 61 | logger.info("local ip address: %s" % localhost) 62 | self.queue = Queue.Queue() 63 | 64 | def open_spider(self, spider): 65 | logger.info("spider opened.") 66 | self.running = True 67 | for i in xrange(50): 68 | thread = Thread(target=self.scan_task, args=()) 69 | thread.start() 70 | 71 | def close_spider(self, spider): 72 | self.running = False 73 | fs.close() 74 | 75 | def process_item(self, item, spider): 76 | self.queue.put(item) 77 | return item 78 | 79 | def scan_task(self): 80 | while self.running or not self.queue.empty(): 81 | try: 82 | item = self.queue.get(True, 1) 83 | scan(item) 84 | except Queue.Empty: 85 | pass 86 | 87 | def scan(item, callback=None): 88 | result = test_proxy(item) 89 | if result is not None: 90 | # only write available http proxy 91 | lock.acquire() 92 | fs.write('%s://%s:%s\n' % (item['type'].lower(),item['ip'],item['port'])) 93 | lock.release() 94 | logger.info('\033[32m[ result]\033[m ip: \033[32m%-15s\033[m, port: \033[32m%-5s\033[m, speed: \033[32m%-4s\033[m, type: \033[32m%s\033[m' % (item['ip'], item['port'], item['speed'], item['type'])) 95 | if item['type'] in ['high', 'anonymous'] and test_http(item) is not None and item['speed'] < 2000: 96 | logger.info('\033[36m[ proxy]\033[m ip: \033[36m%-15s\033[m, port: \033[36m%-5s\033[m, speed: \033[36m%-4s\033[m, type: \033[36m%-11s\033[m, post: \033[36m%-5s\033[m, ssl: \033[36m%-5s\033[m' % (item['ip'], item['port'], item['speed'], item['type'], item['post'], item['ssl'])) 97 | if callback is not None: 98 | callback(item) 99 | else: 100 | logger.info('\033[31m[ result]\033[m ip: \033[31m%-15s\033[m, port: \033[31m%-5s\033[m, type: \033[31m%s\033[m, \033[31mproxy server not alive or healthy.\033[m' % (item['ip'], item['port'], item['type'])) 101 | 102 | def test_http(item, verbose=False): 103 | proxyHandler = urllib2.ProxyHandler({'http':'http://%s:%s' % (item['ip'], item['port']), 'https':'http://%s:%s' % (item['ip'], item['port'])}) 104 | opener = urllib2.build_opener(proxyHandler) 105 | opener.addheaders = { 106 | 'Accept-Encoding': 'gzip,deflate,sdch', 107 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 108 | 'User-Agent': 'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)', 109 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 110 | 'Cache-Control': 'max-age=0' 111 | }.items() 112 | check_map = { 113 | "http://zhidao.baidu.com/robots.txt": "Baiduspider", 114 | "http://weibo.com/robots.txt": "sitemap", 115 | "http://www.qq.com/robots.txt": "Disallow", 116 | "http://xyq.163.com/robots.txt": "sitemap", 117 | "http://www.cnbeta.com/robots.txt": "manager", 118 | 
"http://www.zhihu.com/robots.txt": "resetpassword", 119 | "http://www.iqiyi.com/robots.txt": "Disallow", 120 | "http://www.taobao.com/robots.txt": "User-agent", 121 | "http://www.jd.com/robots.txt": "EtaoSpider", 122 | "http://www.58.com/robots.txt": "User-agent" 123 | } 124 | bad = 0 125 | total_time = item['speed'] 126 | success = 1 127 | for url in check_map: 128 | try: 129 | req = urllib2.Request(url) 130 | begin = time.time() 131 | resp = opener.open(req) 132 | content = resp.read() 133 | if resp.info().get('Content-Encoding') == 'gzip': 134 | buf = StringIO(content) 135 | f = gzip.GzipFile(fileobj=buf) 136 | content = f.read() 137 | if content.find(check_map[url]) < 0: 138 | bad += 1 139 | if verbose: 140 | log.msg(repr(content), log.DEBUG) 141 | else: 142 | success += 1; 143 | total_time += int((time.time() - begin) * 1000) 144 | if verbose: 145 | log.msg("%s %d" % (url, int((time.time() - begin) * 1000)), log.DEBUG) 146 | except Exception, e: 147 | bad += 1 148 | if verbose: 149 | logger.error("%s %s" % (url, e)) 150 | if success * 1.0 / (len(check_map.items()) + 1) < 0.8: 151 | return None 152 | else: 153 | item['speed'] = total_time / success 154 | item['post'] = False 155 | try: 156 | req = urllib2.Request('http://httpbin.org/post', 'q=this_is_a_test') 157 | resp = opener.open(req) 158 | content = resp.read() 159 | if content.find('this_is_a_test') > 0: 160 | item['post'] = True 161 | except: 162 | pass 163 | item['ssl'] = False 164 | try: 165 | req = urllib2.Request('https://httpbin.org/get?q=this_is_a_test') 166 | resp = opener.open(req) 167 | content = resp.read() 168 | if content.find('this_is_a_test') > 0: 169 | item['ssl'] = True 170 | except: 171 | pass 172 | return item 173 | 174 | def test_proxy(item): 175 | try: 176 | item['port'] = int(item['port']) 177 | except ValueError: 178 | return None 179 | if item['type'] == 'http': 180 | proxyHandler = urllib2.ProxyHandler({'http':'http://%s:%s' % (item['ip'], item['port']), 'https':'http://%s:%s' % (item['ip'], item['port'])}) 181 | opener = urllib2.build_opener(proxyHandler) 182 | opener.addheaders = { 183 | 'Accept-Encoding': 'gzip', 184 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 185 | 'User-Agent': 'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)', 186 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 187 | 'Cache-Control': 'max-age=0' 188 | }.items() 189 | try: 190 | req = urllib2.Request('http://httpbin.org/get') 191 | begin = time.time() 192 | resp = opener.open(req) 193 | content = resp.read() 194 | item['speed'] = int((time.time() - begin) * 1000) 195 | content = json.loads(content) 196 | if content['origin'].find(localhost) != -1: 197 | # print '\t[Leak Header] X-Forwarded-For: %s' % content['origin'] 198 | item['type'] = 'transparent' 199 | return item 200 | if len(content['origin'].split(',')) > 1: 201 | # print '\t[Leak Header] X-Forwarded-For: %s' % content['origin'] 202 | item['type'] = 'anonymous' 203 | return item 204 | # logger.error('ip: %s' % item['ip']) 205 | # for key in content['headers']: 206 | # logger.error('%s: %s' % (key, content['headers'][key])) 207 | for key in content['headers']: 208 | if content['headers'][key].find(localhost) != -1: 209 | # print '\t[Leak Header] %s: %s' % (key, content['headers'][key]) 210 | item['type'] = 'transparent' 211 | return item 212 | if key.lower() in proxy_headers: 213 | # print '\t[Leak Header] %s: %s' % (key, content['headers'][key]) 214 | item['type'] = 'anonymous' 215 | if item['type'] == 'http': 216 | 
item['type'] = 'high' 217 | return item 218 | except exceptions.ValueError, error: 219 | # print 'host seems to be a proxy with limitation' 220 | # print error 221 | pass 222 | except httplib.BadStatusLine, error: 223 | # print error 224 | pass 225 | except urllib2.URLError, error: 226 | # print error 227 | pass 228 | except socket.timeout, error: 229 | # print error 230 | pass 231 | except socket.error, error: 232 | # print error 233 | pass 234 | elif item['type'] == 'socks4': 235 | sock = socket.socket() 236 | try: 237 | begin = time.time() 238 | sock.connect((item['ip'], int(item['port']))) 239 | sock.send('\x04\x01\x00\x50\x36\xaf\xde\xf6MOZ\x00') 240 | response = sock.recv(10) 241 | # print repr(response) 242 | if response.find('\x00\x5A') == 0: 243 | item['speed'] = int((time.time() - begin) * 1000) 244 | sock.close() 245 | return item 246 | except socket.timeout, error: 247 | # print error 248 | pass 249 | except socket.error, error: 250 | # print error 251 | pass 252 | elif item['type'] == 'socks5': 253 | sock = socket.socket() 254 | try: 255 | begin = time.time() 256 | sock.connect((item['ip'], int(item['port']))) 257 | sock.send('\x05\x01\x00') 258 | response = sock.recv(3) 259 | # print repr(response) 260 | if response.find('\x05\x00') == 0: 261 | item['speed'] = int((time.time() - begin) * 1000) 262 | sock.close() 263 | return item 264 | except socket.timeout, error: 265 | # print error 266 | pass 267 | except socket.error, error: 268 | # print error 269 | pass 270 | return None 271 | # raise DropItem('proxy server not alive or healthy.') 272 | 273 | 274 | if __name__ == '__main__': 275 | item = {} 276 | item['ip'] = '120.52.72.58' 277 | item['port'] = '80' 278 | item['speed'] = 1000 279 | item['type'] = 'anonymous' 280 | print test_http(item, True) 281 | -------------------------------------------------------------------------------- /crawler/contrib/socks.py: -------------------------------------------------------------------------------- 1 | """SocksiPy - Python SOCKS module. 2 | Version 1.00 3 | 4 | Copyright 2006 Dan-Haim. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without modification, 7 | are permitted provided that the following conditions are met: 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 3. Neither the name of Dan Haim nor the names of his contributors may be used 14 | to endorse or promote products derived from this software without specific 15 | prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY DAN HAIM "AS IS" AND ANY EXPRESS OR IMPLIED 18 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 19 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 20 | EVENT SHALL DAN HAIM OR HIS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA 23 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 24 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 25 | OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMANGE. 
26 | 27 | 28 | This module provides a standard socket-like interface for Python 29 | for tunneling connections through SOCKS proxies. 30 | 31 | """ 32 | 33 | import socket 34 | import struct 35 | 36 | PROXY_TYPE_SOCKS4 = 1 37 | PROXY_TYPE_SOCKS5 = 2 38 | PROXY_TYPE_HTTP = 3 39 | 40 | _defaultproxy = None 41 | _orgsocket = socket.socket 42 | 43 | class ProxyError(Exception): 44 | def __init__(self, value): 45 | self.value = value 46 | def __str__(self): 47 | return repr(self.value) 48 | 49 | class GeneralProxyError(ProxyError): 50 | def __init__(self, value): 51 | self.value = value 52 | def __str__(self): 53 | return repr(self.value) 54 | 55 | class Socks5AuthError(ProxyError): 56 | def __init__(self, value): 57 | self.value = value 58 | def __str__(self): 59 | return repr(self.value) 60 | 61 | class Socks5Error(ProxyError): 62 | def __init__(self, value): 63 | self.value = value 64 | def __str__(self): 65 | return repr(self.value) 66 | 67 | class Socks4Error(ProxyError): 68 | def __init__(self, value): 69 | self.value = value 70 | def __str__(self): 71 | return repr(self.value) 72 | 73 | class HTTPError(ProxyError): 74 | def __init__(self, value): 75 | self.value = value 76 | def __str__(self): 77 | return repr(self.value) 78 | 79 | _generalerrors = ("success", 80 | "invalid data", 81 | "not connected", 82 | "not available", 83 | "bad proxy type", 84 | "bad input") 85 | 86 | _socks5errors = ("succeeded", 87 | "general SOCKS server failure", 88 | "connection not allowed by ruleset", 89 | "Network unreachable", 90 | "Host unreachable", 91 | "Connection refused", 92 | "TTL expired", 93 | "Command not supported", 94 | "Address type not supported", 95 | "Unknown error") 96 | 97 | _socks5autherrors = ("succeeded", 98 | "authentication is required", 99 | "all offered authentication methods were rejected", 100 | "unknown username or invalid password", 101 | "unknown error") 102 | 103 | _socks4errors = ("request granted", 104 | "request rejected or failed", 105 | "request rejected because SOCKS server cannot connect to identd on the client", 106 | "request rejected because the client program and identd report different user-ids", 107 | "unknown error") 108 | 109 | def setdefaultproxy(proxytype=None,addr=None,port=None,rdns=True,username=None,password=None): 110 | """setdefaultproxy(proxytype, addr[, port[, rdns[, username[, password]]]]) 111 | Sets a default proxy which all further socksocket objects will use, 112 | unless explicitly changed. 113 | """ 114 | global _defaultproxy 115 | _defaultproxy = (proxytype,addr,port,rdns,username,password) 116 | 117 | class socksocket(socket.socket): 118 | """socksocket([family[, type[, proto]]]) -> socket object 119 | 120 | Open a SOCKS enabled socket. The parameters are the same as 121 | those of the standard socket init. In order for SOCKS to work, 122 | you must specify family=AF_INET, type=SOCK_STREAM and proto=0. 123 | """ 124 | 125 | def __init__(self, family=socket.AF_INET, type=socket.SOCK_STREAM, proto=0, _sock=None): 126 | _orgsocket.__init__(self,family,type,proto,_sock) 127 | if _defaultproxy != None: 128 | self.__proxy = _defaultproxy 129 | else: 130 | self.__proxy = (None, None, None, None, None, None) 131 | self.__proxysockname = None 132 | self.__proxypeername = None 133 | 134 | def __recvall(self, bytes): 135 | """__recvall(bytes) -> data 136 | Receive EXACTLY the number of bytes requested from the socket. 137 | Blocks until the required number of bytes have been received. 
138 | """ 139 | data = "" 140 | while len(data) < bytes: 141 | data = data + self.recv(bytes-len(data)) 142 | return data 143 | 144 | def setproxy(self,proxytype=None,addr=None,port=None,rdns=True,username=None,password=None): 145 | """setproxy(proxytype, addr[, port[, rdns[, username[, password]]]]) 146 | Sets the proxy to be used. 147 | proxytype - The type of the proxy to be used. Three types 148 | are supported: PROXY_TYPE_SOCKS4 (including socks4a), 149 | PROXY_TYPE_SOCKS5 and PROXY_TYPE_HTTP 150 | addr - The address of the server (IP or DNS). 151 | port - The port of the server. Defaults to 1080 for SOCKS 152 | servers and 8080 for HTTP proxy servers. 153 | rdns - Should DNS queries be preformed on the remote side 154 | (rather than the local side). The default is True. 155 | Note: This has no effect with SOCKS4 servers. 156 | username - Username to authenticate with to the server. 157 | The default is no authentication. 158 | password - Password to authenticate with to the server. 159 | Only relevant when username is also provided. 160 | """ 161 | self.__proxy = (proxytype,addr,port,rdns,username,password) 162 | 163 | def __negotiatesocks5(self,destaddr,destport): 164 | """__negotiatesocks5(self,destaddr,destport) 165 | Negotiates a connection through a SOCKS5 server. 166 | """ 167 | # First we'll send the authentication packages we support. 168 | if (self.__proxy[4]!=None) and (self.__proxy[5]!=None): 169 | # The username/password details were supplied to the 170 | # setproxy method so we support the USERNAME/PASSWORD 171 | # authentication (in addition to the standard none). 172 | self.sendall("\x05\x02\x00\x02") 173 | else: 174 | # No username/password were entered, therefore we 175 | # only support connections with no authentication. 176 | self.sendall("\x05\x01\x00") 177 | # We'll receive the server's response to determine which 178 | # method was selected 179 | chosenauth = self.__recvall(2) 180 | if chosenauth[0] != "\x05": 181 | self.close() 182 | raise GeneralProxyError((1,_generalerrors[1])) 183 | # Check the chosen authentication method 184 | if chosenauth[1] == "\x00": 185 | # No authentication is required 186 | pass 187 | elif chosenauth[1] == "\x02": 188 | # Okay, we need to perform a basic username/password 189 | # authentication. 190 | self.sendall("\x01" + chr(len(self.__proxy[4])) + self.__proxy[4] + chr(len(self.proxy[5])) + self.__proxy[5]) 191 | authstat = self.__recvall(2) 192 | if authstat[0] != "\x01": 193 | # Bad response 194 | self.close() 195 | raise GeneralProxyError((1,_generalerrors[1])) 196 | if authstat[1] != "\x00": 197 | # Authentication failed 198 | self.close() 199 | raise Socks5AuthError,((3,_socks5autherrors[3])) 200 | # Authentication succeeded 201 | else: 202 | # Reaching here is always bad 203 | self.close() 204 | if chosenauth[1] == "\xFF": 205 | raise Socks5AuthError((2,_socks5autherrors[2])) 206 | else: 207 | raise GeneralProxyError((1,_generalerrors[1])) 208 | # Now we can request the actual connection 209 | req = "\x05\x01\x00" 210 | # If the given destination address is an IP address, we'll 211 | # use the IPv4 address request even if remote resolving was specified. 212 | try: 213 | ipaddr = socket.inet_aton(destaddr) 214 | req = req + "\x01" + ipaddr 215 | except socket.error: 216 | # Well it's not an IP number, so it's probably a DNS name. 
217 | if self.__proxy[3]==True: 218 | # Resolve remotely 219 | ipaddr = None 220 | req = req + "\x03" + chr(len(destaddr)) + destaddr 221 | else: 222 | # Resolve locally 223 | ipaddr = socket.inet_aton(socket.gethostbyname(destaddr)) 224 | req = req + "\x01" + ipaddr 225 | req = req + struct.pack(">H",destport) 226 | self.sendall(req) 227 | # Get the response 228 | resp = self.__recvall(4) 229 | if resp[0] != "\x05": 230 | self.close() 231 | raise GeneralProxyError((1,_generalerrors[1])) 232 | elif resp[1] != "\x00": 233 | # Connection failed 234 | self.close() 235 | if ord(resp[1])<=8: 236 | raise Socks5Error(ord(resp[1]),_generalerrors[ord(resp[1])]) 237 | else: 238 | raise Socks5Error(9,_generalerrors[9]) 239 | # Get the bound address/port 240 | elif resp[3] == "\x01": 241 | boundaddr = self.__recvall(4) 242 | elif resp[3] == "\x03": 243 | resp = resp + self.recv(1) 244 | boundaddr = self.__recvall(resp[4]) 245 | else: 246 | self.close() 247 | raise GeneralProxyError((1,_generalerrors[1])) 248 | boundport = struct.unpack(">H",self.__recvall(2))[0] 249 | self.__proxysockname = (boundaddr,boundport) 250 | if ipaddr != None: 251 | self.__proxypeername = (socket.inet_ntoa(ipaddr),destport) 252 | else: 253 | self.__proxypeername = (destaddr,destport) 254 | 255 | def getproxysockname(self): 256 | """getsockname() -> address info 257 | Returns the bound IP address and port number at the proxy. 258 | """ 259 | return self.__proxysockname 260 | 261 | def getproxypeername(self): 262 | """getproxypeername() -> address info 263 | Returns the IP and port number of the proxy. 264 | """ 265 | return _orgsocket.getpeername(self) 266 | 267 | def getpeername(self): 268 | """getpeername() -> address info 269 | Returns the IP address and port number of the destination 270 | machine (note: getproxypeername returns the proxy) 271 | """ 272 | return self.__proxypeername 273 | 274 | def __negotiatesocks4(self,destaddr,destport): 275 | """__negotiatesocks4(self,destaddr,destport) 276 | Negotiates a connection through a SOCKS4 server. 277 | """ 278 | # Check if the destination address provided is an IP address 279 | rmtrslv = False 280 | try: 281 | ipaddr = socket.inet_aton(destaddr) 282 | except socket.error: 283 | # It's a DNS name. Check where it should be resolved. 284 | if self.__proxy[3]==True: 285 | ipaddr = "\x00\x00\x00\x01" 286 | rmtrslv = True 287 | else: 288 | ipaddr = socket.inet_aton(socket.gethostbyname(destaddr)) 289 | # Construct the request packet 290 | req = "\x04\x01" + struct.pack(">H",destport) + ipaddr 291 | # The username parameter is considered userid for SOCKS4 292 | if self.__proxy[4] != None: 293 | req = req + self.__proxy[4] 294 | req = req + "\x00" 295 | # DNS name if remote resolving is required 296 | # NOTE: This is actually an extension to the SOCKS4 protocol 297 | # called SOCKS4A and may not be supported in all cases. 
298 | if rmtrslv==True: 299 | req = req + destaddr + "\x00" 300 | self.sendall(req) 301 | # Get the response from the server 302 | resp = self.__recvall(8) 303 | if resp[0] != "\x00": 304 | # Bad data 305 | self.close() 306 | raise GeneralProxyError((1,_generalerrors[1])) 307 | if resp[1] != "\x5A": 308 | # Server returned an error 309 | self.close() 310 | if ord(resp[1]) in (91,92,93): 311 | self.close() 312 | raise Socks4Error((ord(resp[1]),_socks4errors[ord(resp[1])-90])) 313 | else: 314 | raise Socks4Error((94,_socks4errors[4])) 315 | # Get the bound address/port 316 | self.__proxysockname = (socket.inet_ntoa(resp[4:]),struct.unpack(">H",resp[2:4])[0]) 317 | if rmtrslv != None: 318 | self.__proxypeername = (socket.inet_ntoa(ipaddr),destport) 319 | else: 320 | self.__proxypeername = (destaddr,destport) 321 | 322 | def __negotiatehttp(self,destaddr,destport): 323 | """__negotiatehttp(self,destaddr,destport) 324 | Negotiates a connection through an HTTP server. 325 | """ 326 | # If we need to resolve locally, we do this now 327 | if self.__proxy[3] == False: 328 | addr = socket.gethostbyname(destaddr) 329 | else: 330 | addr = destaddr 331 | self.sendall("CONNECT " + addr + ":" + str(destport) + " HTTP/1.1\r\n" + "Host: " + destaddr + "\r\n\r\n") 332 | # We read the response until we get the string "\r\n\r\n" 333 | resp = self.recv(1) 334 | while resp.find("\r\n\r\n")==-1: 335 | resp = resp + self.recv(1) 336 | # We just need the first line to check if the connection 337 | # was successful 338 | statusline = resp.splitlines()[0].split(" ",2) 339 | if statusline[0] not in ("HTTP/1.0","HTTP/1.1"): 340 | self.close() 341 | raise GeneralProxyError((1,_generalerrors[1])) 342 | try: 343 | statuscode = int(statusline[1]) 344 | except ValueError: 345 | self.close() 346 | raise GeneralProxyError((1,_generalerrors[1])) 347 | if statuscode != 200: 348 | self.close() 349 | raise HTTPError((statuscode,statusline[2])) 350 | self.__proxysockname = ("0.0.0.0",0) 351 | self.__proxypeername = (addr,destport) 352 | 353 | def connect(self,destpair): 354 | """connect(self,despair) 355 | Connects to the specified destination through a proxy. 356 | destpar - A tuple of the IP/DNS address and the port number. 357 | (identical to socket's connect). 358 | To select the proxy server use setproxy(). 
359 | """ 360 | # Do a minimal input check first 361 | if (type(destpair) in (list,tuple)==False) or (len(destpair)<2) or (type(destpair[0])!=str) or (type(destpair[1])!=int): 362 | raise GeneralProxyError((5,_generalerrors[5])) 363 | if self.__proxy[0] == PROXY_TYPE_SOCKS5: 364 | if self.__proxy[2] != None: 365 | portnum = self.__proxy[2] 366 | else: 367 | portnum = 1080 368 | _orgsocket.connect(self,(self.__proxy[1],portnum)) 369 | self.__negotiatesocks5(destpair[0],destpair[1]) 370 | elif self.__proxy[0] == PROXY_TYPE_SOCKS4: 371 | if self.__proxy[2] != None: 372 | portnum = self.__proxy[2] 373 | else: 374 | portnum = 1080 375 | _orgsocket.connect(self,(self.__proxy[1],portnum)) 376 | self.__negotiatesocks4(destpair[0],destpair[1]) 377 | elif self.__proxy[0] == PROXY_TYPE_HTTP: 378 | if self.__proxy[2] != None: 379 | portnum = self.__proxy[2] 380 | else: 381 | portnum = 8080 382 | _orgsocket.connect(self,(self.__proxy[1],portnum)) 383 | self.__negotiatehttp(destpair[0],destpair[1]) 384 | elif self.__proxy[0] == None: 385 | _orgsocket.connect(self,(destpair[0],destpair[1])) 386 | else: 387 | raise GeneralProxyError((4,_generalerrors[4])) 388 | --------------------------------------------------------------------------------