├── .gitignore ├── LICENSE ├── README.md ├── requirements.txt ├── rzproxy ├── __init__.py ├── check_proxy.py ├── db │ ├── __init__.py │ ├── mysql_db.py │ └── sqlite_db.py ├── http_relay.py ├── logger.py ├── manager.py └── run.py └── tests ├── test_mysql.py ├── test_proxy.py └── test_sqlite3.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.txt 3 | *.db 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 2 | Version 2, December 2004 3 | 4 | Copyright (C) 2004 Sam Hocevar 5 | 6 | Everyone is permitted to copy and distribute verbatim or modified 7 | copies of this license document, and changing it is allowed as long 8 | as the name is changed. 9 | 10 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 11 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 12 | 13 | 0. You just DO WHAT THE FUCK YOU WANT TO. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rzproxy 2 | [![WTFPL](https://raw.githubusercontent.com/legacy-icons/license-icons/master/dist/32x32/wtfpl.png) WTFPL](http://www.wtfpl.net/) 3 | 4 | > A local proxy help you chose the bset proxy from proxy pool. 5 | 6 | 7 | ## Usage 8 | 9 | Start rzproxy 10 | 11 | python run.py --host 127.0.0.1 --port 8399 --file-name proxy_pool.txt 12 | 13 | proxy.txt format: 14 | 15 | 1.1.1.1:8080 16 | 2.2.2.2:8080 17 | 18 | Then use ("127.0.0.1:8399") as http proxy 19 | 20 | import requests 21 | requests.get(url, proixes={"http": "http://127.0.0.1:8399"}) 22 | 23 | ## TODO 24 | - [ ] support max connection for each proxy depeding on weight -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click>=6.6 2 | gevent>=1.0.2 3 | requests>=2.10.0 4 | https://cdn.mysql.com/Downloads/Connector-Python/mysql-connector-python-2.1.3.tar.gz 5 | -------------------------------------------------------------------------------- /rzproxy/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Author: loalj 4 | # Created on 2016-06-27 17:00:00 5 | 6 | __version__ = '0.1' 7 | -------------------------------------------------------------------------------- /rzproxy/check_proxy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import logging 3 | 4 | import requests 5 | from gevent import pool 6 | from gevent import monkey 7 | monkey.patch_socket() 8 | 9 | TIME_OUT_SECOND = 5 10 | HEADERS = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) " \ 11 | "AppleWebKit/537.36 (KHTML, like Gecko) " \ 12 | "Chrome/51.0.2704.103 Safari/537.36" 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class ProxyCheck(object): 18 | def __init__(self, proxy_list, queue, target_url=None): 19 | self._proxy_list = proxy_list 20 | self.headers = {"User-Agent": HEADERS} 21 | self.url_list = [ 22 | "http://www.baidu.com", 23 | "http://www.qq.com", 24 | "http://www.bing.com" 25 | ] 26 | if target_url: 27 | self.url_list.append(target_url) 28 | self.queue = queue 29 | 30 | def _calculate_weight(self, proxy): 31 | weight = 0 32 | format_proxy = {"http": proxy} 33 | for url in self.url_list: 34 | response_time = self._dump_reposne_time(format_proxy, url) 35 | weight += 1.0 / response_time if response_time > 0 else 0 36 | weight = weight / 4 37 | self.proxy_insert_cache[proxy] = weight 38 | # self.queue.set(proxy, weight) 39 | 40 | def check(self): 41 | self.proxy_insert_cache = {} 42 | proxy_pool = pool.Pool(20) 43 | for proxy in self._proxy_list: 44 | proxy_pool.spawn(self._calculate_weight, proxy) 45 | proxy_pool.join() 46 | for proxy, weight in self.proxy_insert_cache.items(): 47 | self.queue.set(proxy, weight) 48 | self.queue.commit() 49 | 50 | def _dump_reposne_time(self, proxy, url): 51 | try: 52 | r = requests.get(url=url, proxies=proxy, headers=self.headers, 53 | timeout=TIME_OUT_SECOND) 54 | if r.status_code == 200: 55 | response_time = r.elapsed.total_seconds() 56 | else: 57 | response_time = -1 58 | except Exception: 59 | response_time = -1 60 | return response_time 61 | -------------------------------------------------------------------------------- /rzproxy/db/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/loadlj/rzproxy/0642c4a4f30b3469eeb76129f22eb63f997d7ef3/rzproxy/db/__init__.py -------------------------------------------------------------------------------- /rzproxy/db/mysql_db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import time 3 | import logging 4 | import mysql.connector 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class MysqlQueue(object): 10 | def __init__(self, host="localhost", port=3306, database='rzproxy', 11 | user="root", passwd=None): 12 | self._database_name = database 13 | self._conn = mysql.connector.connect(user=user, 14 | password=passwd, 15 | host=host, 16 | port=port, 17 | autocommit=True) 18 | if database not in [x[0] for x in self._execute('show databases')]: 19 | self._execute('CREATE DATABASE {}'.format(database)) 20 | self._conn.database = database 21 | self._execute('''CREATE TABLE IF NOT EXISTS proxy_pool ( 22 | `proxy` varchar(20) PRIMARY KEY, 23 | `weight` double(16, 4), 24 | `updatetime` double(16, 4) 25 | )ENGINE=InnoDB CHARSET=utf8''') 26 | 27 | @property 28 | def best_proxy(self): 29 | result_cur = self._execute( 30 | "SELECT proxy FROM proxy_pool GROUP BY weight DESC LIMIT 1") 31 | return result_cur.fetchone()[0] 32 | 33 | @property 34 | def last_updatetime(self): 35 | result_cur = self._execute("SELECT updatetime FROM proxy_pool \ 36 | GROUP BY updatetime DESC LIMIT 1") 37 | result = result_cur.fetchone() 38 | if result: 39 | return result[0] 40 | else: 41 | # no data in the table 42 | return 0 43 | 44 | @property 45 | def setup_cache(self): 46 | cache = {} 47 | result_list = self._execute("select proxy, weight from proxy_pool") 48 | for result in result_list.fetchall(): 49 | cache[result[0]] = result[1] 50 | return cache 51 | 52 | def get(self, key): 53 | result_cur = self._execute( 54 | "SELECT weight from proxy_pool WHERE proxy='{}'".format(key)) 55 | result = result_cur.fetchone()[0] 56 | return result 57 | 58 | def set(self, key, value, now=time.time()): 59 | # when the proxy is checked 60 | self._execute('''INSERT INTO proxy_pool(proxy, weight, updatetime) 61 | values ("{0}", {1}, {2}) ON DUPLICATE KEY UPDATE \ 62 | weight={1}, updatetime={2}'''.format(key, value, now)) 63 | 64 | def remove(self, key): 65 | self._execute("DELETE FROM proxy_pool WHERE proxy='{}'".format(key)) 66 | 67 | def _update(self, key, value): 68 | # not change the updatetime 69 | self._execute( 70 | "UPDATE proxy_pool SET weight={} WHERE proxy='{}'" 71 | .format(value, key)) 72 | 73 | def clean_all(self): 74 | self._execute("TRUNCATE TABLE proxy_pool") 75 | 76 | def _execute(self, sql_query, values=[]): 77 | dbcur = self._dbcur() 78 | dbcur.execute(sql_query, values) 79 | return dbcur 80 | 81 | def _dbcur(self): 82 | try: 83 | if self._conn.unread_result: 84 | self._conn.get_rows() 85 | return self._conn.cursor() 86 | except (mysql.connector.OperationalError, 87 | mysql.connector.InterfaceError): 88 | self._conn.ping(reconnect=True) 89 | self._conn.database = self._database_name 90 | return self._conn.cursor() 91 | -------------------------------------------------------------------------------- /rzproxy/db/sqlite_db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import time 3 | import logging 4 | import sqlite3 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class SqliteQueue(object): 10 | def __init__(self): 11 | self._conn = sqlite3.connect("rzproxy.db") 12 | self._execute('''CREATE TABLE IF NOT EXISTS proxy_pool ( 13 | `proxy` text PRIMARY KEY, 14 | `weight` real, 15 | `updatetime` real 16 | )''') 17 | 18 | @property 19 | def best_proxy(self): 20 | result_cur = self._execute( 21 | "SELECT proxy FROM proxy_pool ORDER BY weight DESC LIMIT 1") 22 | return result_cur.fetchone()[0] 23 | 24 | @property 25 | def last_updatetime(self): 26 | result_cur = self._execute("SELECT updatetime FROM proxy_pool\ 27 | ORDER BY updatetime DESC LIMIT 1") 28 | result = result_cur.fetchone() 29 | if result: 30 | return result[0] 31 | else: 32 | # start checking proxy list 33 | return 0 34 | 35 | @property 36 | def setup_cache(self): 37 | cache = {} 38 | result_list = self._execute("select proxy, weight from proxy_pool") 39 | for result in result_list.fetchall(): 40 | cache[result[0]] = result[1] 41 | return cache 42 | 43 | def get(self, key): 44 | result_cur = self._execute( 45 | "SELECT weight FROM proxy_pool WHERE proxy='{}'".format(key)) 46 | return result_cur.fetchone()[0] 47 | 48 | def set(self, key, value, now=time.time()): 49 | self._execute("REPLACE INTO proxy_pool VALUES('{0}', {1}, {2})" 50 | .format(key, value, now)) 51 | 52 | def commit(self): 53 | self._conn.commit() 54 | 55 | def remove(self, key): 56 | self._execute("DELETE FROM proxy_pool WHERE proxy='{}'".format(key)) 57 | 58 | def _execute(self, sql_query, values=[]): 59 | dbcur = self._dbcur() 60 | dbcur.execute(sql_query, values) 61 | return dbcur 62 | 63 | def _dbcur(self): 64 | return self._conn.cursor() 65 | -------------------------------------------------------------------------------- /rzproxy/http_relay.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import re 3 | import socket 4 | import logging 5 | import multiprocessing 6 | 7 | from gevent import pool 8 | from gevent import select 9 | from gevent.server import StreamServer 10 | from gevent import monkey 11 | monkey.patch_socket() 12 | 13 | BUF_SIZE = 4 * 1024 14 | CRLF = b"\r\n" 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class HttpRelayHandler(multiprocessing.Process): 20 | # pool_count max is 100000 21 | # ensure the proxy weight is max 22 | 23 | def __init__(self, queue, proxy=("127.0.0.1", 8399), pool_count=100): 24 | multiprocessing.Process.__init__(self) 25 | self._proxy = proxy 26 | self._queue = queue 27 | self._pool = pool.Pool(pool_count) 28 | self._cache = None 29 | # tag proxy when response code is not 200 30 | self._error_code_trigger = {} 31 | self._server = StreamServer( 32 | proxy, self._handle_connection, spawn=self._pool) 33 | 34 | def _handle_connection(self, local_sock, address): 35 | if not self._cache: 36 | self._cache = self._queue.setup_cache 37 | cache = self._cache 38 | best_proxy = max(cache, key=cache.get) 39 | proxy_value = self._cache.get(best_proxy) 40 | logger.debug("proxy is {}, weight is {}" 41 | .format(best_proxy, proxy_value)) 42 | self._cache[best_proxy] = proxy_value * 0.5 43 | ip, port = best_proxy.split(":") 44 | 45 | try: 46 | max_connection = 0 47 | remote_sock = self._create_remote_connection((ip, int(port))) 48 | while True: 49 | r, w, e = select.select( 50 | [local_sock, remote_sock], [], []) 51 | if local_sock in r: 52 | request_data = local_sock.recv(BUF_SIZE) 53 | max_connection += 1 54 | if remote_sock.send(request_data) <= 0: 55 | break 56 | 57 | if remote_sock in r: 58 | response_data = remote_sock.recv(BUF_SIZE) 59 | if local_sock.send(response_data) <= 0: 60 | logger.debug("remote close connection") 61 | remote_sock.close() 62 | break 63 | response = self._parse_response(response_data) 64 | if response: 65 | response_code = re.match("HTTP/\d\.\d (\d+)", 66 | response).groups()[0] 67 | self._sweep_unvalid_proxy(best_proxy, response_code) 68 | request = self._parse_request(request_data) 69 | logger.info("({}) {} {}".format( 70 | best_proxy, request, response)) 71 | if max_connection >= 10: 72 | remote_sock.close() 73 | break 74 | 75 | self._cache[best_proxy] = self._cache[best_proxy] / 0.5 76 | except Exception: 77 | import traceback 78 | traceback.print_exc() 79 | 80 | # reduce proxy weight when the same error code repeat 5 times 81 | def _sweep_unvalid_proxy(self, proxy, error_code): 82 | if not re.match("^2|3\d\d", error_code): 83 | if proxy in self._error_code_trigger: 84 | if self._error_code_trigger[proxy][error_code] >= 5: 85 | logger.error("{} is not valid, sweep it".format(proxy)) 86 | self._error_code_trigger[proxy] = {error_code: 0} 87 | self._cache[proxy] = self._cache[proxy] * 0.1 88 | else: 89 | count = self._error_code_trigger[proxy][error_code] 90 | logger.info("{} {}".format(proxy, count)) 91 | self._error_code_trigger[proxy][error_code] += 1 92 | else: 93 | self._error_code_trigger[proxy] = {error_code: 0} 94 | 95 | def setup_cache(self): 96 | self._cache = self._queue.setup_cache 97 | 98 | def _parse_request(self, request_data): 99 | request_header = request_data.split(CRLF)[0] 100 | return request_header 101 | 102 | def _parse_response(self, response_data): 103 | header = response_data.split(CRLF)[0] 104 | if re.match(r"HTTP/\d\.\d", header): 105 | return header 106 | else: 107 | return None 108 | 109 | def _create_remote_connection(self, proxy): 110 | remote_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 111 | remote_sock.connect(proxy) 112 | return remote_sock 113 | 114 | def run(self): 115 | logger.info("Starting local server on {}.".format(self._proxy)) 116 | self._server.serve_forever() 117 | -------------------------------------------------------------------------------- /rzproxy/logger.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/env python 2 | import sys 3 | import logging 4 | 5 | 6 | # These are the sequences need to get colored ouput 7 | RESET_SEQ = "\033[0m" 8 | COLOR_SEQ = "\033[1;%dm" 9 | 10 | 11 | def formatter_message(message, use_color=True): 12 | if use_color: 13 | message = message.replace( 14 | "$RESET", RESET_SEQ) 15 | else: 16 | message = message.replace("$RESET", "") 17 | return message 18 | 19 | COLORS = { 20 | 'WARNING': 3, 21 | 'INFO': 2, 22 | 'DEBUG': 4, 23 | 'CRITICAL': 1, 24 | 'ERROR': 1 25 | } 26 | 27 | 28 | class ColoredFormatter(logging.Formatter): 29 | def __init__(self, msg, use_color=True): 30 | logging.Formatter.__init__(self, msg) 31 | self.use_color = use_color 32 | 33 | def format(self, record): 34 | levelname = record.levelname 35 | if self.use_color and levelname in COLORS: 36 | levelname_color = COLOR_SEQ % (30 + COLORS[levelname]) \ 37 | + levelname + RESET_SEQ 38 | msg = COLOR_SEQ % (30 + COLORS[levelname]) \ 39 | + record.msg + RESET_SEQ 40 | record.levelname = levelname_color 41 | record.msg = msg 42 | return logging.Formatter.format(self, record) 43 | 44 | 45 | def set_logger(loglevel, use_color=True, handler=None): 46 | logger = logging.getLogger() 47 | if not handler: 48 | handler = logging.StreamHandler(sys.stdout) 49 | FORMAT = "[%(name)s$RESET:%(lineno)d] [%(levelname)s] [%(asctime)s]"\ 50 | " $RESET>> %(message)s " 51 | else: 52 | FORMAT = "[%(name)s$RESET] [%(levelname)s] >> %(message)s " 53 | if use_color: 54 | COLOR_FORMAT = formatter_message(FORMAT, True) 55 | color_formatter = ColoredFormatter(COLOR_FORMAT) 56 | handler.setFormatter(color_formatter) 57 | else: 58 | handler.setFormatter(logging.Formatter( 59 | formatter_message(FORMAT, False))) 60 | # ignore requests lib 61 | logging.getLogger("requests").setLevel(logging.WARNING) 62 | logger.setLevel(loglevel) 63 | logger.addHandler(handler) 64 | -------------------------------------------------------------------------------- /rzproxy/manager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import time 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class Manager(object): 9 | def __init__(self, checker, queue, handler, interval): 10 | self._queue = queue 11 | self._last_updatetime = queue.last_updatetime 12 | self._interval = interval 13 | self._is_handler_start = False 14 | self._handler = handler 15 | self._checker = checker 16 | 17 | def _schedule(self): 18 | # set up the crontab 19 | while True: 20 | now = int(time.time()) 21 | if now - self._last_updatetime >= self._interval: 22 | logger.info( 23 | "start checking proxy list...just wait a minute") 24 | self._checker.check() 25 | self._last_updatetime = now 26 | self._handler.setup_cache() 27 | logger.info("the proxy_list has checked out...") 28 | self._call_back() 29 | 30 | def _call_back(self): 31 | if self._is_handler_start: 32 | time.sleep(10) 33 | else: 34 | self._is_handler_start = True 35 | self._handler.setup_cache() 36 | self._handler.start() 37 | 38 | def run(self): 39 | self._schedule() 40 | -------------------------------------------------------------------------------- /rzproxy/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import click 3 | import logging 4 | 5 | from manager import Manager 6 | from logger import set_logger 7 | from check_proxy import ProxyCheck 8 | from db.sqlite_db import SqliteQueue 9 | from db.mysql_db import MysqlQueue 10 | from http_relay import HttpRelayHandler 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def load_file(proxy_file): 16 | with open(proxy_file, 'rb') as r: 17 | for line in r.readlines(): 18 | yield line.strip() 19 | 20 | 21 | @click.command() 22 | @click.option("--host", default="127.0.0.1", help="rzproxy host") 23 | @click.option("--db-type", default="sqlite", help="mysql, sqlite") 24 | @click.option("--port", default=8399, help="rzproxy port", type=int) 25 | @click.option("--file-name", help="proxy list file", required=True) 26 | @click.option("--mysql-host", default="127.0.0.1", help="mysql host") 27 | @click.option("--mysql-port", default=3306, help="mysql port", type=float) 28 | @click.option("--db", default="rzproxy", help="mysql name") 29 | @click.option("--user", default="root", help="mysql user") 30 | @click.option("--password", help="mysql password") 31 | @click.option("--target-url", default=None, 32 | help="the target url you will crawl") 33 | @click.option("--interval", default=30 * 60, 34 | help="scheduler interval", type=float) 35 | @click.option("--log-level", default="INFO", 36 | help="DEBUG, INFO, WARNING, ERROR, CRITICAL") 37 | def main(host, db_type, port, file_name, mysql_host, mysql_port, 38 | db, user, password, target_url, interval, log_level): 39 | set_logger(getattr(logging, log_level)) 40 | proxy_list = load_file(file_name) 41 | if db_type == "sqlite": 42 | queue = SqliteQueue() 43 | else: 44 | queue = MysqlQueue(mysql_host, mysql_port, db, user, password) 45 | checker = ProxyCheck(proxy_list, queue, target_url) 46 | relay_handler = HttpRelayHandler(queue, (host, port)) 47 | scheldurer = Manager(checker, queue, relay_handler, interval) 48 | scheldurer.run() 49 | 50 | if __name__ == '__main__': 51 | main() 52 | -------------------------------------------------------------------------------- /tests/test_mysql.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import time 3 | import unittest 4 | 5 | from rzproxy.db.mysql_db import MysqlQueue 6 | 7 | 8 | class TestQueue(unittest.TestCase): 9 | def setUp(self): 10 | self.queue = MysqlQueue(passwd="") 11 | 12 | def test_get(self): 13 | self.queue.set("127.0.0.1", 123.00) 14 | self.assertEqual(self.queue.get("127.0.0.1"), 123.00) 15 | 16 | def test_set_updatetime(self): 17 | now = time.time() 18 | self.queue.set("127.0.0.1", 123.00, now) 19 | self.assertEqual(self.queue.last_updatetime, round(now, 2)) 20 | 21 | def tearDown(self): 22 | self.queue.remove("127.0.0.1") 23 | 24 | if __name__ == "__main__": 25 | unittest.main() 26 | -------------------------------------------------------------------------------- /tests/test_proxy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import unittest 3 | import gevent 4 | import requests 5 | from gevent import monkey 6 | monkey.patch_socket() 7 | 8 | 9 | class TestProxy(unittest.TestCase): 10 | def test_proxy(self): 11 | local_proxy = {"http": "http://127.0.0.1:8399"} 12 | 13 | def get(): 14 | r = requests.get("http://www.baidu.com", proxies=local_proxy) 15 | self.assertEqual(r.status_code, 200) 16 | 17 | gevent_list = [] 18 | for i in xrange(5): 19 | gevent_list.append(gevent.spawn(get)) 20 | gevent.joinall(gevent_list) 21 | -------------------------------------------------------------------------------- /tests/test_sqlite3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import time 3 | import unittest 4 | 5 | from rzproxy.db.sqlite_db import SqliteQueue 6 | 7 | 8 | class TestQueue(unittest.TestCase): 9 | def test_get(self): 10 | queue = SqliteQueue() 11 | queue.set("127.0.0.1", 1.122) 12 | self.assertEqual(queue.get("127.0.0.1"), 1.122) 13 | queue.remove("127.0.0.1") 14 | 15 | def test_set_updatetime(self): 16 | queue = SqliteQueue() 17 | now = time.time() 18 | queue.set("127.0.0.1", 123.00, now) 19 | self.assertEqual(queue.last_updatetime, round(now, 2)) 20 | queue.remove("127.0.0.1") 21 | 22 | def test_setup_cache(self): 23 | queue = SqliteQueue() 24 | queue.set("127.0.0.1", 123.12) 25 | cache = queue.setup_cache 26 | self.assertEqual(cache["127.0.0.1"], 123.12) 27 | queue.remove("127.0.0.1") 28 | 29 | 30 | if __name__ == "__main__": 31 | unittest.main() 32 | --------------------------------------------------------------------------------