├── src ├── api │ ├── __init__.py │ ├── wsgi.py │ ├── utils.py │ ├── urls.py │ ├── views.py │ └── settings.py ├── ip_proxy │ ├── __init__.py │ ├── utils.py │ ├── models.py │ ├── ip_proxy.py │ ├── crawl.py │ ├── validator.py │ └── settings.py ├── uwsgi.ini ├── django_wsgi.py └── manage.py ├── .gitignore ├── requirements.txt ├── deploy ├── nginx.conf └── supervisord.conf └── README.md /src/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/* 2 | *.pyc 3 | *.rdb -------------------------------------------------------------------------------- /src/ip_proxy/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | version = '0.0.1' 3 | __author__ = 'cleland' 4 | -------------------------------------------------------------------------------- /src/uwsgi.ini: -------------------------------------------------------------------------------- 1 | [uwsgi] 2 | socket = 118.99.23.163:8080 3 | chdir=/root/webapp/ip_proxy/src/ 4 | wsgi-file = django_wsgi.py 5 | plugins=python 6 | processes = 4 7 | threads = 2 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Django==1.10.5 2 | django-redis-cache==1.7.1 3 | gevent==1.2.1 4 | greenlet==0.4.11 5 | lxml==3.7.2 6 | mongoengine==0.11.0 7 | pymongo==3.4.0 8 | redis==2.10.5 9 | requests==2.12.4 10 | six==1.10.0 11 | -------------------------------------------------------------------------------- /src/django_wsgi.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import os 3 | import sys 4 | 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | 8 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "api.settings") 9 | 10 | from django.core.wsgi import get_wsgi_application 11 | application = get_wsgi_application() 12 | -------------------------------------------------------------------------------- /src/api/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for src project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.10/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "api.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /src/api/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import json 3 | from django.http import HttpResponse 4 | 5 | 6 | def render_json_only(view_func): 7 | """ render http response to json decorator 8 | """ 9 | 10 | def wrap(request, *args, **kwargs): 11 | retval = view_func(request, *args, **kwargs) 12 | if isinstance(retval, HttpResponse): 13 | retval.mimetype = 'application/json; charset=utf-8' 14 | return retval 15 | else: 16 | js = json.dumps(retval) 17 | return HttpResponse(js, content_type='application/json; charset=utf-8') 18 | 19 | return wrap -------------------------------------------------------------------------------- /src/ip_proxy/utils.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | 4 | def ranking(proxies, count=None): 5 | """当前根据成功率单一指标进行ip排名 6 | 当times<5, 不进入ip排名 7 | times>=5. 取最后10次的数据求平均值 8 | 9 | TODO: 评估指标: 成功率, 平均数据, ip速度的稳定性 10 | """ 11 | if not proxies: return [] 12 | failed_flag = 0 13 | items = [] 14 | for proxy in proxies: 15 | speeds = proxy['speeds'] 16 | speeds_len = len(speeds) 17 | if speeds_len <= 5: 18 | continue 19 | failed_count = speeds.count(failed_flag) 20 | success_rate = 1 - (float(failed_count) / speeds_len) 21 | ip_addr = '{ip}:{port}'.format(ip=proxy['ip'], port=proxy['port']) 22 | items.append((ip_addr, success_rate)) 23 | proxies = sorted(items, key=lambda item: item[1], reverse=True) 24 | return proxies[:count] 25 | -------------------------------------------------------------------------------- /src/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "api.settings") 7 | try: 8 | from django.core.management import execute_from_command_line 9 | except ImportError: 10 | # The above import may fail for some other reason. Ensure that the 11 | # issue is really that Django is missing to avoid masking other 12 | # exceptions on Python 2. 13 | try: 14 | import django 15 | except ImportError: 16 | raise ImportError( 17 | "Couldn't import Django. Are you sure it's installed and " 18 | "available on your PYTHONPATH environment variable? Did you " 19 | "forget to activate a virtual environment?" 20 | ) 21 | raise 22 | execute_from_command_line(sys.argv) 23 | -------------------------------------------------------------------------------- /src/api/urls.py: -------------------------------------------------------------------------------- 1 | """src URL Configuration 2 | 3 | The `urlpatterns` list routes URLs to views. For more information please see: 4 | https://docs.djangoproject.com/en/1.10/topics/http/urls/ 5 | Examples: 6 | Function views 7 | 1. Add an import: from my_app import views 8 | 2. Add a URL to urlpatterns: url(r'^$', views.home, name='home') 9 | Class-based views 10 | 1. Add an import: from other_app.views import Home 11 | 2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home') 12 | Including another URLconf 13 | 1. Import the include() function: from django.conf.urls import url, include 14 | 2. Add a URL to urlpatterns: url(r'^blog/', include('blog.urls')) 15 | """ 16 | from django.conf.urls import url 17 | from django.contrib import admin 18 | from api.views import ip_proxy 19 | 20 | urlpatterns = [ 21 | url(r'^admin/', admin.site.urls), 22 | url(r'^proxy/', ip_proxy) 23 | ] 24 | -------------------------------------------------------------------------------- /src/ip_proxy/models.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import datetime 3 | from mongoengine import Document 4 | from mongoengine import IntField, StringField, DateTimeField, FloatField, ListField 5 | 6 | 7 | class IpProxies(Document): 8 | TYPE_CHOICES = (u'高匿', u'匿名') 9 | PRO_CHOICES = ('http', 'https') 10 | 11 | ip = StringField(required=True, unique=True) 12 | port = IntField(required=True) 13 | ip_type = StringField(choices=TYPE_CHOICES, default=u'匿名') 14 | protocol = StringField(choices=PRO_CHOICES, default='http') 15 | speeds = ListField(FloatField()) 16 | creation_date = DateTimeField() 17 | update_date = DateTimeField() 18 | meta = {"db_alias": "material"} 19 | 20 | def save(self, *args, **kwargs): 21 | if not self.creation_date: 22 | self.creation_date = datetime.datetime.now() 23 | return super(IpProxies, self).save(*args, **kwargs) 24 | 25 | def get_proxies(self): 26 | proxy_address = '{ip}:{port}'.format( 27 | ip=self.ip, 28 | port=self.port 29 | ) 30 | return { 31 | 'http': 'http://%s' % proxy_address, 32 | 'https': 'https://%s' % proxy_address, 33 | } 34 | -------------------------------------------------------------------------------- /src/api/views.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import json 3 | 4 | from django.core.cache import cache 5 | from django.views.decorators.csrf import csrf_exempt 6 | from django.http import HttpResponseForbidden 7 | from ip_proxy.models import IpProxies 8 | from ip_proxy.utils import ranking 9 | from api.utils import render_json_only 10 | 11 | 12 | REDIS_KEY = 'ip' 13 | NEVER_REDIS_TIMEOUT = 60 * 2 # 缓存2分钟更新一次 14 | 15 | 16 | @csrf_exempt 17 | @render_json_only 18 | def ip_proxy(request): 19 | if request.method == 'POST': 20 | try: 21 | data = request.POST 22 | count = int(data['count']) 23 | except: 24 | return u'请求失败' 25 | else: 26 | count = None 27 | proxies = get_proxy() 28 | return proxies[:count] 29 | 30 | 31 | def get_proxy(): 32 | proxies = cache.get(REDIS_KEY) 33 | if proxies: 34 | return json.loads(proxies) 35 | if not proxies or (len(proxies) == 0): 36 | proxies = [] 37 | objs = IpProxies.objects.all() 38 | for obj in objs: 39 | proxies.append(json.loads(obj.to_json())) 40 | proxies = ranking(proxies) 41 | proxies = [item[0] for item in proxies] 42 | cache.set(REDIS_KEY, json.dumps(proxies), NEVER_REDIS_TIMEOUT) 43 | return proxies 44 | 45 | -------------------------------------------------------------------------------- /deploy/nginx.conf: -------------------------------------------------------------------------------- 1 | # For more information on configuration, see: 2 | # * Official English Documentation: http://nginx.org/en/docs/ 3 | # * Official Russian Documentation: http://nginx.org/ru/docs/ 4 | 5 | user nginx; 6 | worker_processes auto; 7 | error_log /var/log/nginx/error.log; 8 | pid /run/nginx.pid; 9 | 10 | # Load dynamic modules. See /usr/share/nginx/README.dynamic. 11 | include /usr/share/nginx/modules/*.conf; 12 | 13 | events { 14 | worker_connections 1024; 15 | } 16 | 17 | http { 18 | log_format main '$remote_addr - $remote_user [$time_local] "$request" ' 19 | '$status $body_bytes_sent "$http_referer" ' 20 | '"$http_user_agent" "$http_x_forwarded_for"'; 21 | 22 | access_log /var/log/nginx/access.log main; 23 | 24 | sendfile on; 25 | tcp_nopush on; 26 | tcp_nodelay on; 27 | keepalive_timeout 65; 28 | types_hash_max_size 2048; 29 | 30 | include /etc/nginx/mime.types; 31 | default_type application/octet-stream; 32 | 33 | # Load modular configuration files from the /etc/nginx/conf.d directory. 34 | # See http://nginx.org/en/docs/ngx_core_module.html#include 35 | # for more information. 36 | include /etc/nginx/conf.d/*.conf; 37 | 38 | server { 39 | listen 80; 40 | server_name 118.99.23.163; 41 | root /usr/share/nginx/html; 42 | 43 | # Load configuration files for the default server block. 44 | include /etc/nginx/default.d/*.conf; 45 | 46 | location / { 47 | uwsgi_pass 118.99.23.163:8080; 48 | include uwsgi_params; 49 | } 50 | 51 | error_page 404 /404.html; 52 | location = /40x.html { 53 | } 54 | 55 | error_page 500 502 503 504 /50x.html; 56 | location = /50x.html { 57 | } 58 | } 59 | } 60 | 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #免费代理IP池 2 | 高频抓取某个网站的数据,很有可能就被网站管理员封掉IP,导致抓取数据失败,解决这个问题最直接,简单的方法就是使用代理IP。目前网上有不少提供付费代理IP的平台,但是如需长期使用,该方案是笔不少的开销。本项目通过抓取IP代理网站提供免费代理IP,并不间断的验证IP的有效性,根据代理IP验证的历史记录对IP进行评估,输出高质量代理IP。 3 | **源码:**https://github.com/clelandgt/ip_proxy 4 | 5 | ##功能 6 | - 100+ 支持https, 匿名或高匿名的免费代理IP; 7 | - 外部提供API调用。 8 | 9 | ##原理 10 | 第一步:爬取多个免费IP代理网站获得代理IP; 11 | 第二步:访问https://www.baidu.com/验证代理IP的有效性和响应时间,由于每次都能爬取10000+个代理,需要使用并发(多进程+协程)方式快速完成验证; 12 | 第三步:将验证通过的代理IP存入mongodb中; 13 | 第四步:睡眠一定时间; 14 | 第五步:开始验证数据库中的IP,根据每个IP的历史失败数和失败率淘汰掉一些低质量或失效的代理IP; 15 | 第六步:验证完数据库中的IP,如果数据库中的IP小于一个预设值(比如100),执行第一步,否则执行第四步。 16 | ![这里写图片描述](http://img.blog.csdn.net/20170222220015846?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvZ2FuemhleXU=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast) 17 | 18 | ##部署 19 | - os:Centos7 20 | - redis 2.8.19 21 | - mongodb 3.4.1 22 | - nginx 1.10.2 23 | 24 | 完成以上系统和软件的安装,其中操作系统可以选择其他linux版本,软件版本没有特殊要求。 25 | 26 | ###克隆代码到本地 27 | $ git clone https://github.com/clelandgt/ip_proxy 28 | 29 | ### 创建虚拟环境 30 | $ mkvirtualenv proxy_env #创建名为proxy_env的虚拟环境 31 | $ workon proxy_env #加载proxy_env虚拟环境 32 | $ pip install -r requirements.txt #导入并安装需要安装的第三方库 33 | 关于虚拟环境的详细介绍和使用详见:http://blog.csdn.net/ganzheyu/article/details/53014726 34 | 35 | ### supervisord 36 | 使用supervisord进行进程管理,当进程出现异常退出时,supervisord会重新启动该进程。 37 | 38 | $ pip install supervisor #安装supervisord 39 | $ mv supervisord.conf /etc/supervisord.conf #默认配置文件在/etc下,所以将项目配置好的deploy目录下的supervisord.conf拷贝到/etc下。 40 | 41 | 本项目需要监控的进程主要有两个,一个是IP代理核心流程"python ip_proxy.py",另一个是提供外部API访问的"uwsgi uwsgi.ini" 42 | 43 | [program:proxy_crawler] 44 | command=python ip_proxy.py 45 | directory=/root/webapp/ip_proxy/src/ip_proxy 46 | autostart=true 47 | autorestart=true 48 | stdout_logfile=/tmp/proxy_crawler.log 49 | 50 | [program:uwsgi] 51 | command=uwsgi uwsgi.ini 52 | directory=/root/webapp/ip_proxy/src 53 | autostart=true 54 | autorestart=true 55 | 56 | ### nginx + uwsgi 配置后台服务 57 | ... 58 | 59 | ### 设置开机自启 60 | $ systemctl enabled supervisord.service #设置supervisord开机自启 61 | $ systemctl enabled redis.service #设置redis开机自启 62 | $ systemctl enabled mongod.service #设置mongodb开机自启 63 | $ systemctl enabled nginx.service #设置nginx开机自启 64 | 65 | 关于systemctl详细使用见:http://blog.csdn.net/ganzheyu/article/details/56335419 66 | -------------------------------------------------------------------------------- /src/ip_proxy/ip_proxy.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import time 3 | import logging 4 | import logging.config 5 | import settings 6 | 7 | from mongoengine import connect 8 | from settings import PARSER_LIST, LOGGING 9 | from models import IpProxies 10 | from crawl import Crawl 11 | from validator import Validator 12 | 13 | 14 | class IPProxy(object): 15 | def __init__(self): 16 | self.config_logging() 17 | self.connect_mongodb() 18 | self.validator = Validator() 19 | self.logger = logging.getLogger(__name__) 20 | 21 | @staticmethod 22 | def connect_mongodb(): 23 | connect(host='mongodb://localhost:27017/material', alias='material') 24 | 25 | @staticmethod 26 | def config_logging(): 27 | logging.config.dictConfig(LOGGING) 28 | 29 | def run(self): 30 | while True: 31 | try: 32 | proxies = IpProxies.objects.all() 33 | self.validate(proxies) 34 | proxies = IpProxies.objects.all() 35 | if proxies.count() < settings.IPS_MIN_NUM: 36 | new_proxies = self.crawl() 37 | self.validate(new_proxies) 38 | time.sleep(settings.UPDATE_TIME) 39 | except Exception as e: 40 | self.logger.exception(e) 41 | 42 | def validate(self, proxies): 43 | proxies_len = len(proxies) 44 | start_time = time.time() 45 | self.logger.info('{0} proxies need validate -------'.format(proxies_len)) 46 | self.validator.run(proxies) 47 | self.logger.info('validate end -------\n' 48 | '{0} proxies, spend {1}s'.format(proxies_len, time.time()-start_time)) 49 | return proxies 50 | 51 | def crawl(self): 52 | crawl = Crawl() 53 | proxies = [] 54 | self.logger.info('crawl beginning -------') 55 | for parser in PARSER_LIST: 56 | for url in parser['urls']: 57 | self.logger.info('crawling {0}'.format(url)) 58 | result = crawl.run(url, parser) 59 | proxies.extend(result) 60 | self.logger.info('crawl end -------\n' 61 | 'crawl {0} ips'.format(len(proxies))) 62 | return proxies 63 | 64 | 65 | def main(): 66 | ip_proxy = IPProxy() 67 | ip_proxy.run() 68 | 69 | 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /src/ip_proxy/crawl.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import requests 3 | import logging 4 | 5 | from lxml import etree 6 | from settings import HEADER, CRAWL_TIMEOUT, MAX_RETRY_TIMES 7 | from models import IpProxies 8 | from utils import ranking 9 | 10 | 11 | class Crawl(object): 12 | def __init__(self): 13 | self.proxies = [] 14 | self.request = requests.Session() 15 | self.request.headers.update(HEADER) 16 | self.request.adapters.DEFAULT_RETRIES = 5 17 | self.logger = logging.getLogger(__name__) 18 | 19 | def run(self, url, parser): 20 | try: 21 | resp = self.download(url) 22 | return self.parse(resp, parser) 23 | except Exception as e: 24 | self.logger.exception(e) 25 | return [] 26 | 27 | def download(self, url): 28 | for index in xrange(1, MAX_RETRY_TIMES+1): 29 | try: 30 | resp = self.request.get(url=url, timeout=CRAWL_TIMEOUT) 31 | if index != 1: 32 | proxy = self.get_proxy() 33 | resp = self.request.get(url=url, timeout=CRAWL_TIMEOUT, proxies=proxy) 34 | if not resp.ok: 35 | raise ValueError('response status code is {}, not 200'.format(resp.status_code)) 36 | self.logger.info('connect url {} success.'.format(url)) 37 | return resp.text 38 | except Exception as e: 39 | self.logger.error(e) 40 | if index == MAX_RETRY_TIMES: 41 | self.logger.error('retry connect url {0} {1} times, but is failed.'.format(url, MAX_RETRY_TIMES)) 42 | raise e 43 | 44 | @staticmethod 45 | def parse(document, parser): 46 | proxies = [] 47 | root = etree.HTML(document) 48 | pattern = root.xpath(parser['pattern']) 49 | 50 | for position in pattern: 51 | ip = position.xpath(parser['position']['ip'])[0].text 52 | port = position.xpath(parser['position']['port'])[0].text 53 | ip_type = position.xpath(parser['position']['type'])[0].text 54 | ip_type = '高匿' if ip_type.find(u'高匿') != -1 else '匿名' 55 | proxies.append({'ip': ip, 'port': int(port), 'ip_type': ip_type, 'protocol': '', 'speeds': []}) 56 | return proxies 57 | 58 | def get_proxy(self): 59 | if len(self.proxies) == 0: 60 | proxies = IpProxies.objects.all() 61 | proxies = ranking(proxies) 62 | self.proxies = [{'http': 'http://{}'.format(proxy[0])} for proxy in proxies] 63 | return self.proxies.pop() 64 | -------------------------------------------------------------------------------- /src/api/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | Django settings for src project. 4 | 5 | Generated by 'django-admin startproject' using Django 1.10.4. 6 | 7 | For more information on this file, see 8 | https://docs.djangoproject.com/en/1.10/topics/settings/ 9 | 10 | For the full list of settings and their values, see 11 | https://docs.djangoproject.com/en/1.10/ref/settings/ 12 | """ 13 | 14 | import os 15 | 16 | import mongoengine 17 | from mongoengine import connection 18 | 19 | 20 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 21 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 22 | 23 | 24 | # Quick-start development settings - unsuitable for production 25 | # See https://docs.djangoproject.com/en/1.10/howto/deployment/checklist/ 26 | 27 | # SECURITY WARNING: keep the secret key used in production secret! 28 | SECRET_KEY = '2-ao$dd)^a&sy7hrm4%sh2+pkd1z)r07cffn#=(r$b$mn1mqbv' 29 | 30 | # SECURITY WARNING: don't run with debug turned on in production! 31 | DEBUG = True 32 | 33 | ALLOWED_HOSTS = ['*'] 34 | 35 | 36 | # Application definition 37 | 38 | INSTALLED_APPS = [ 39 | 'django.contrib.admin', 40 | 'django.contrib.auth', 41 | 'django.contrib.contenttypes', 42 | 'django.contrib.sessions', 43 | 'django.contrib.messages', 44 | 'django.contrib.staticfiles', 45 | ] 46 | 47 | MIDDLEWARE = [ 48 | 'django.middleware.security.SecurityMiddleware', 49 | 'django.contrib.sessions.middleware.SessionMiddleware', 50 | 'django.middleware.common.CommonMiddleware', 51 | 'django.middleware.csrf.CsrfViewMiddleware', 52 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 53 | 'django.contrib.messages.middleware.MessageMiddleware', 54 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 55 | ] 56 | 57 | ROOT_URLCONF = 'api.urls' 58 | 59 | TEMPLATES = [ 60 | { 61 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 62 | 'DIRS': [], 63 | 'APP_DIRS': True, 64 | 'OPTIONS': { 65 | 'context_processors': [ 66 | 'django.template.context_processors.debug', 67 | 'django.template.context_processors.request', 68 | 'django.contrib.auth.context_processors.auth', 69 | 'django.contrib.messages.context_processors.messages', 70 | ], 71 | }, 72 | }, 73 | ] 74 | 75 | WSGI_APPLICATION = 'api.wsgi.application' 76 | 77 | 78 | # Database 79 | # https://docs.djangoproject.com/en/1.10/ref/settings/#databases 80 | 81 | # DATABASES = { 82 | # 'default': { 83 | # 'ENGINE': 'django.db.backends.sqlite3', 84 | # 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), 85 | # } 86 | # } 87 | 88 | 89 | # Password validation 90 | # https://docs.djangoproject.com/en/1.10/ref/settings/#auth-password-validators 91 | 92 | AUTH_PASSWORD_VALIDATORS = [ 93 | { 94 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 95 | }, 96 | { 97 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 98 | }, 99 | { 100 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 101 | }, 102 | { 103 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 104 | }, 105 | ] 106 | 107 | 108 | # Internationalization 109 | # https://docs.djangoproject.com/en/1.10/topics/i18n/ 110 | 111 | LANGUAGE_CODE = 'en-us' 112 | 113 | TIME_ZONE = 'UTC' 114 | 115 | USE_I18N = True 116 | 117 | USE_L10N = True 118 | 119 | USE_TZ = True 120 | 121 | 122 | # Static files (CSS, JavaScript, Images) 123 | # https://docs.djangoproject.com/en/1.10/howto/static-files/ 124 | 125 | STATIC_URL = '/static/' 126 | 127 | 128 | # Redis address 129 | REDIS_ADDR = 'redis://localhost:6379' 130 | # 内网服务器地址 131 | REDIS_HOST = 'localhost' 132 | REDIS_PORT = 6379 133 | 134 | CACHES = { 135 | 'default': { 136 | 'BACKEND': 'redis_cache.cache.RedisCache', 137 | 'LOCATION': REDIS_ADDR, 138 | "OPTIONS": { 139 | "CLIENT_CLASS": "redis_cache.client.DefaultClient", 140 | }, 141 | }, 142 | } 143 | 144 | # Connect MongoDB 145 | MONGODBS = ['material'] 146 | MONGO_HOST = 'localhost' 147 | MONGO_PORT = 27017 148 | 149 | for name in MONGODBS: 150 | if name not in connection._connections: 151 | mongoengine.register_connection(alias=name, name=name, 152 | host=MONGO_HOST, port=MONGO_PORT) -------------------------------------------------------------------------------- /src/ip_proxy/validator.py: -------------------------------------------------------------------------------- 1 | # coding:utf- 2 | import time 3 | import requests 4 | import logging 5 | import multiprocessing 6 | 7 | from multiprocessing import Queue 8 | from gevent import monkey 9 | from gevent.pool import Pool 10 | monkey.patch_all() 11 | 12 | from mongoengine import NotUniqueError, DoesNotExist 13 | from requests.exceptions import RequestException 14 | from models import IpProxies 15 | from settings import (HEADER, TEST_URL, VALIDATE_TIMEOUT, VALIDATE_PROCESS_NUM, VALIDATE_THREAD_NUM, CONT_FAIL_TIMES, 16 | FAIL_RATE_LIMIT, ON_FAIL_RATE_TIMES) 17 | 18 | 19 | FAIL_PLACEHOLDER = 0 20 | 21 | 22 | class Validator(object): 23 | def __init__(self): 24 | self.request = requests.Session() 25 | self.request.adapters.DEFAULT_RETRIES = 5 26 | self.request.headers.update(HEADER) 27 | self.logger = logging.getLogger(__name__) 28 | 29 | def run(self, ips): 30 | cocurrent(self.validate, ips, VALIDATE_PROCESS_NUM, VALIDATE_THREAD_NUM) 31 | 32 | def validate(self, ip_obj): 33 | ip, port = ip_obj['ip'], ip_obj['port'] 34 | ip_obj['protocol'] = 'https' 35 | ip_addr = '{ip}:{port}'.format(ip=ip, port=port) 36 | proxies = {'https': 'https://{}'.format(ip_addr)} 37 | start = time.time() 38 | try: 39 | resp = requests.get(url=TEST_URL, timeout=VALIDATE_TIMEOUT, proxies=proxies, verify=False) 40 | if not resp.ok: 41 | raise RequestException 42 | speed = round(time.time() - start, 2) 43 | ip_obj['speeds'].append(speed) 44 | self.store_into_db(ip_obj) 45 | self.logger.info('success ip={ip}, port={port}, speed={speed}\n'.format(ip=ip, port=port, speed=speed)) 46 | except RequestException: 47 | self.logger.warning('fail ip={}\n'.format(ip)) 48 | self.handle_request_error(ip_obj) 49 | 50 | def handle_request_error(self, ip_obj): 51 | """处理验证失败的代理ip 52 | 爬取的ip直接返回. 53 | 数据库里的ip验证淘汰规则(失败的ip,speed=99): 54 | 1. 失败数: 连续验证失败3次的IP直接上删除淘汰. 55 | 2. 失败率: ip验证次数超过10次时,开启失败率淘汰(当失败率>50%时,直接淘汰删除) 56 | :param ip_obj: 57 | :return: 58 | """ 59 | ip_obj['speeds'].append(FAIL_PLACEHOLDER) 60 | ip, speeds = ip_obj['ip'], ip_obj['speeds'] 61 | speeds_len = len(speeds) 62 | if speeds_len >= CONT_FAIL_TIMES: 63 | # 失败数 64 | last_speeds = speeds[(0 - CONT_FAIL_TIMES):] 65 | if len(last_speeds) == last_speeds.count(FAIL_PLACEHOLDER): 66 | self.logger.warning('ip {ip} continue fail {count} times arrive limit.'.format(ip=ip, count=CONT_FAIL_TIMES)) 67 | self.delete_ip_from_db(ip) 68 | return 69 | # 失败率 70 | if speeds_len >= ON_FAIL_RATE_TIMES: 71 | fail_count = speeds.count(FAIL_PLACEHOLDER) 72 | fail_rate = float(fail_count)/speeds_len 73 | if fail_rate > FAIL_RATE_LIMIT: 74 | self.logger.warning('ip failed rate {} arrive limit.'.format(fail_rate)) 75 | self.delete_ip_from_db(ip) 76 | return 77 | self.store_into_db(ip_obj) 78 | 79 | @staticmethod 80 | def store_into_db(ip_obj): 81 | ip, port, ip_type, protocol, speeds = ip_obj['ip'], ip_obj['port'], ip_obj['ip_type'], ip_obj['protocol'], ip_obj['speeds'] 82 | try: 83 | obj = IpProxies.objects.get(ip=ip) 84 | if len(speeds) == 1: 85 | speeds.extend(obj['speeds']) 86 | obj.update(port=port, ip_type=ip_type, protocol=protocol, speeds=speeds) 87 | except DoesNotExist: 88 | IpProxies(ip=ip, port=port, ip_type=ip_type, protocol=protocol, speeds=speeds).save() 89 | 90 | def delete_ip_from_db(self, ip): 91 | IpProxies.objects(ip=ip).delete() 92 | self.logger.warning('delete ip {0} from database'.format(ip)) 93 | 94 | 95 | def cocurrent(func, items, process_num, coroutine_num): 96 | queue = Queue() 97 | pieces = average_cut_list(items, process_num) 98 | processes = [] 99 | for piece in pieces: 100 | process = multiprocessing.Process(target=process_with_coroutine, args=(func, piece, queue, coroutine_num)) 101 | process.start() 102 | processes.append(process) 103 | for process in processes: 104 | process.join() 105 | 106 | results = [] 107 | for _ in processes: 108 | result = queue.get() 109 | results.extend(result) 110 | return results 111 | 112 | 113 | def process_with_coroutine(func, piece, queue, coroutine_num): 114 | validate_pool = Pool(coroutine_num) 115 | result = validate_pool.map(func, piece) 116 | queue.put(result) 117 | 118 | 119 | def average_cut_list(source_list, count): 120 | func = lambda A, n: [A[i:i + n] for i in range(0, len(A), n)] 121 | return func(source_list, count) 122 | -------------------------------------------------------------------------------- /src/ip_proxy/settings.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import os 3 | import random 4 | import multiprocessing 5 | 6 | 7 | # 配置爬取IP流程 8 | CRAWL_THREAD_NUM = 20 9 | CRAWL_TIMEOUT = 10 10 | MAX_RETRY_TIMES = 5 11 | 12 | 13 | # 配置验证IP流程 14 | VALIDATE_PROCESS_NUM = multiprocessing.cpu_count() 15 | VALIDATE_THREAD_NUM = 100 16 | VALIDATE_TIMEOUT = 8 17 | CONT_FAIL_TIMES = 3 18 | FAIL_RATE_LIMIT = 0.5 19 | ON_FAIL_RATE_TIMES = 10 20 | 21 | 22 | UPDATE_TIME = 2 * 60 # 每两分钟验证一轮数据库中的IP 23 | IPS_MIN_NUM = 100 24 | 25 | 26 | TEST_URL = 'https://www.baidu.com/' 27 | 28 | 29 | PARSER_LIST = [ 30 | { 31 | 'urls': ['http://www.66ip.cn/%s.html'% n for n in range(1, 20)], 32 | 'pattern': ".//*[@id='main']/div/div[1]/table/tr[position()>1]", 33 | 'position':{'ip': './td[1]', 'port': './td[2]', 'type': './td[4]'} 34 | }, 35 | { 36 | 'urls': ['http://www.66ip.cn/areaindex_%s/%s.html' % (m, n) for m in range(1, 35) for n in range(1, 5)], 37 | 'pattern': ".//*[@id='footer']/div/table/tr[position()>1]", 38 | 'position':{'ip': './td[1]', 'port': './td[2]', 'type': './td[4]'} 39 | }, 40 | { 41 | 'urls': ['http://www.kuaidaili.com/proxylist/%s/' % n for n in range(1, 10)], 42 | 'pattern': ".//*[@id='index_free_list']/table/tbody/tr[position()>0]", 43 | 'position':{'ip': './td[1]', 'port': './td[2]', 'type': './td[3]'} 44 | }, 45 | { 46 | 'urls': ['http://www.kuaidaili.com/free/%s/%s/' % (m, n) for m in ['inha', 'outha'] for n in range(1, 10)], 47 | 'pattern': ".//*[@id='list']/table/tbody/tr[position()>0]", 48 | 'position':{'ip': './td[1]', 'port': './td[2]', 'type': './td[3]'} 49 | }, 50 | { 51 | 'urls': ['http://www.ip181.com/daili/%s.html' % n for n in range(1, 10)], 52 | 'pattern': "/html/body/div[2]/div/div[2]/div/div[3]/table/tbody/tr[position()>1]", 53 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]'} 54 | 55 | }, 56 | { 57 | 'urls': ['http://www.xicidaili.com/nn/%s' % n for n in range(1, 5)], 58 | 'pattern': ".//*[@id='ip_list']/tr[position()>1]", 59 | 'position': {'ip': './td[2]', 'port': './td[3]', 'type': './td[5]'} 60 | }, 61 | { 62 | 'urls': ['http://ip84.com/%s/%s' % (m, n) for m in ('gn', 'pn') for n in range(1, 10)], 63 | 'pattern': ".//*[@id='content']/div[1]/table/tr[position()>1]", 64 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]'} 65 | }, 66 | { 67 | 'urls': ['http://www.nianshao.me/?stype=2&page=%s' % n for n in range(1, 50)], 68 | 'pattern': ".//*[@class='table']/tbody/tr[position()>1]", 69 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]'} 70 | }, 71 | ] 72 | 73 | 74 | USER_AGENTS = [ 75 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 76 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 77 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 78 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 79 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 80 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 81 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 82 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 83 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 84 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 85 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 86 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 87 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 88 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 89 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 90 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 91 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11", 92 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", 93 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)", 94 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)", 95 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER", 96 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", 97 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)", 98 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", 99 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)", 100 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", 101 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", 102 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", 103 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", 104 | "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5", 105 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre", 106 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0", 107 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11", 108 | "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10" 109 | ] 110 | 111 | 112 | HEADER = { 113 | 'User-Agent': random.choice(USER_AGENTS), 114 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 115 | 'Accept-Language': 'en-US,en;q=0.5', 116 | 'Connection': 'keep-alive', 117 | 'Accept-Encoding': 'gzip, deflate', 118 | } 119 | 120 | 121 | # logging configure file 122 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 123 | LOGGING = { 124 | "version": 1, 125 | "disable_existing_loggers": False, 126 | "formatters": { 127 | "basic": { 128 | "format": "%(levelname)s - %(asctime)s - %(module)s - %(message)s" 129 | } 130 | }, 131 | "handlers": { 132 | "info_file_handler": { 133 | "class": "logging.handlers.RotatingFileHandler", 134 | "level": "INFO", 135 | "formatter": "basic", 136 | "filename": os.path.join(BASE_DIR, "ip_proxy.log"), 137 | "encoding": "utf8" 138 | }, 139 | }, 140 | "root": { 141 | "level": "INFO", 142 | "handlers": ["info_file_handler"] 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /deploy/supervisord.conf: -------------------------------------------------------------------------------- 1 | ; Sample supervisor config file. 2 | ; 3 | ; For more information on the config file, please see: 4 | ; http://supervisord.org/configuration.html 5 | ; 6 | ; Notes: 7 | ; - Shell expansion ("~" or "$HOME") is not supported. Environment 8 | ; variables can be expanded using this syntax: "%(ENV_HOME)s". 9 | ; - Comments must have a leading space: "a=b ;comment" not "a=b;comment". 10 | 11 | [unix_http_server] 12 | file=/tmp/supervisor.sock ; (the path to the socket file) 13 | ;chmod=0700 ; socket file mode (default 0700) 14 | ;chown=nobody:nogroup ; socket file uid:gid owner 15 | ;username=user ; (default is no username (open server)) 16 | ;password=123 ; (default is no password (open server)) 17 | 18 | ;[inet_http_server] ; inet (TCP) server disabled by default 19 | ;port=127.0.0.1:9001 ; (ip_address:port specifier, *:port for all iface) 20 | ;username=user ; (default is no username (open server)) 21 | ;password=123 ; (default is no password (open server)) 22 | 23 | [supervisord] 24 | logfile=/tmp/supervisord.log ; (main log file;default $CWD/supervisord.log) 25 | logfile_maxbytes=50MB ; (max main logfile bytes b4 rotation;default 50MB) 26 | logfile_backups=10 ; (num of main logfile rotation backups;default 10) 27 | loglevel=info ; (log level;default info; others: debug,warn,trace) 28 | pidfile=/tmp/supervisord.pid ; (supervisord pidfile;default supervisord.pid) 29 | nodaemon=false ; (start in foreground if true;default false) 30 | minfds=1024 ; (min. avail startup file descriptors;default 1024) 31 | minprocs=200 ; (min. avail process descriptors;default 200) 32 | ;umask=022 ; (process file creation umask;default 022) 33 | ;user=chrism ; (default is current user, required if root) 34 | ;identifier=supervisor ; (supervisord identifier, default is 'supervisor') 35 | ;directory=/tmp ; (default is not to cd during start) 36 | ;nocleanup=true ; (don't clean up tempfiles at start;default false) 37 | ;childlogdir=/tmp ; ('AUTO' child log dir, default $TEMP) 38 | ;environment=KEY="value" ; (key value pairs to add to environment) 39 | ;strip_ansi=false ; (strip ansi escape codes in logs; def. false) 40 | 41 | ; the below section must remain in the config file for RPC 42 | ; (supervisorctl/web interface) to work, additional interfaces may be 43 | ; added by defining them in separate rpcinterface: sections 44 | [rpcinterface:supervisor] 45 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface 46 | 47 | [supervisorctl] 48 | serverurl=unix:///tmp/supervisor.sock ; use a unix:// URL for a unix socket 49 | ;serverurl=http://127.0.0.1:9001 ; use an http:// url to specify an inet socket 50 | ;username=chris ; should be same as http_username if set 51 | ;password=123 ; should be same as http_password if set 52 | ;prompt=mysupervisor ; cmd line prompt (default "supervisor") 53 | ;history_file=~/.sc_history ; use readline history if available 54 | 55 | ; The below sample program section shows all possible program subsection values, 56 | ; create one or more 'real' program: sections to be able to control them under 57 | ; supervisor. 58 | 59 | ;[program:theprogramname] 60 | ;command=/bin/cat ; the program (relative uses PATH, can take args) 61 | ;process_name=%(program_name)s ; process_name expr (default %(program_name)s) 62 | ;numprocs=1 ; number of processes copies to start (def 1) 63 | ;directory=/tmp ; directory to cwd to before exec (def no cwd) 64 | ;umask=022 ; umask for process (default None) 65 | ;priority=999 ; the relative start priority (default 999) 66 | ;autostart=true ; start at supervisord start (default: true) 67 | ;startsecs=1 ; # of secs prog must stay up to be running (def. 1) 68 | ;startretries=3 ; max # of serial start failures when starting (default 3) 69 | ;autorestart=unexpected ; when to restart if exited after running (def: unexpected) 70 | ;exitcodes=0,2 ; 'expected' exit codes used with autorestart (default 0,2) 71 | ;stopsignal=QUIT ; signal used to kill process (default TERM) 72 | ;stopwaitsecs=10 ; max num secs to wait b4 SIGKILL (default 10) 73 | ;stopasgroup=false ; send stop signal to the UNIX process group (default false) 74 | ;killasgroup=false ; SIGKILL the UNIX process group (def false) 75 | ;user=chrism ; setuid to this UNIX account to run the program 76 | ;redirect_stderr=true ; redirect proc stderr to stdout (default false) 77 | ;stdout_logfile=/a/path ; stdout log path, NONE for none; default AUTO 78 | ;stdout_logfile_maxbytes=1MB ; max # logfile bytes b4 rotation (default 50MB) 79 | ;stdout_logfile_backups=10 ; # of stdout logfile backups (default 10) 80 | ;stdout_capture_maxbytes=1MB ; number of bytes in 'capturemode' (default 0) 81 | ;stdout_events_enabled=false ; emit events on stdout writes (default false) 82 | ;stderr_logfile=/a/path ; stderr log path, NONE for none; default AUTO 83 | ;stderr_logfile_maxbytes=1MB ; max # logfile bytes b4 rotation (default 50MB) 84 | ;stderr_logfile_backups=10 ; # of stderr logfile backups (default 10) 85 | ;stderr_capture_maxbytes=1MB ; number of bytes in 'capturemode' (default 0) 86 | ;stderr_events_enabled=false ; emit events on stderr writes (default false) 87 | ;environment=A="1",B="2" ; process environment additions (def no adds) 88 | ;serverurl=AUTO ; override serverurl computation (childutils) 89 | 90 | ; The below sample eventlistener section shows all possible 91 | ; eventlistener subsection values, create one or more 'real' 92 | ; eventlistener: sections to be able to handle event notifications 93 | ; sent by supervisor. 94 | 95 | ;[eventlistener:theeventlistenername] 96 | ;command=/bin/eventlistener ; the program (relative uses PATH, can take args) 97 | ;process_name=%(program_name)s ; process_name expr (default %(program_name)s) 98 | ;numprocs=1 ; number of processes copies to start (def 1) 99 | ;events=EVENT ; event notif. types to subscribe to (req'd) 100 | ;buffer_size=10 ; event buffer queue size (default 10) 101 | ;directory=/tmp ; directory to cwd to before exec (def no cwd) 102 | ;umask=022 ; umask for process (default None) 103 | ;priority=-1 ; the relative start priority (default -1) 104 | ;autostart=true ; start at supervisord start (default: true) 105 | ;startsecs=1 ; # of secs prog must stay up to be running (def. 1) 106 | ;startretries=3 ; max # of serial start failures when starting (default 3) 107 | ;autorestart=unexpected ; autorestart if exited after running (def: unexpected) 108 | ;exitcodes=0,2 ; 'expected' exit codes used with autorestart (default 0,2) 109 | ;stopsignal=QUIT ; signal used to kill process (default TERM) 110 | ;stopwaitsecs=10 ; max num secs to wait b4 SIGKILL (default 10) 111 | ;stopasgroup=false ; send stop signal to the UNIX process group (default false) 112 | ;killasgroup=false ; SIGKILL the UNIX process group (def false) 113 | ;user=chrism ; setuid to this UNIX account to run the program 114 | ;redirect_stderr=false ; redirect_stderr=true is not allowed for eventlisteners 115 | ;stdout_logfile=/a/path ; stdout log path, NONE for none; default AUTO 116 | ;stdout_logfile_maxbytes=1MB ; max # logfile bytes b4 rotation (default 50MB) 117 | ;stdout_logfile_backups=10 ; # of stdout logfile backups (default 10) 118 | ;stdout_events_enabled=false ; emit events on stdout writes (default false) 119 | ;stderr_logfile=/a/path ; stderr log path, NONE for none; default AUTO 120 | ;stderr_logfile_maxbytes=1MB ; max # logfile bytes b4 rotation (default 50MB) 121 | ;stderr_logfile_backups=10 ; # of stderr logfile backups (default 10) 122 | ;stderr_events_enabled=false ; emit events on stderr writes (default false) 123 | ;environment=A="1",B="2" ; process environment additions 124 | ;serverurl=AUTO ; override serverurl computation (childutils) 125 | 126 | ; The below sample group section shows all possible group values, 127 | ; create one or more 'real' group: sections to create "heterogeneous" 128 | ; process groups. 129 | 130 | ;[group:thegroupname] 131 | ;programs=progname1,progname2 ; each refers to 'x' in [program:x] definitions 132 | ;priority=999 ; the relative start priority (default 999) 133 | 134 | ; The [include] section can just contain the "files" setting. This 135 | ; setting can list multiple files (separated by whitespace or 136 | ; newlines). It can also contain wildcards. The filenames are 137 | ; interpreted as relative to this file. Included files *cannot* 138 | ; include files themselves. 139 | 140 | ;[include] 141 | ;files = relative/directory/*.ini 142 | 143 | [program:proxy_crawler] 144 | command=python ip_proxy.py 145 | directory=/root/webapp/ip_proxy/src/ip_proxy 146 | autostart=true 147 | autorestart=true 148 | stdout_logfile=/tmp/proxy_crawler.log 149 | 150 | [program:uwsgi] 151 | command=uwsgi uwsgi.ini 152 | directory=/root/webapp/ip_proxy/src 153 | autostart=true 154 | autorestart=true 155 | --------------------------------------------------------------------------------