├── ip_proxy_site ├── ip_proxy │ ├── __init__.py │ ├── migrations │ │ ├── __init__.py │ │ ├── 0003_ipproxy_count.py │ │ ├── 0002_ipproxy_available.py │ │ └── 0001_initial.py │ ├── tests.py │ ├── apps.py │ ├── admin.py │ ├── urls.py │ ├── models.py │ └── views.py ├── ip_proxy_site │ ├── __init__.py │ ├── wsgi.py │ ├── urls.py │ └── settings.py ├── static │ ├── favicon.ico │ ├── css │ │ └── test_01.css │ └── js │ │ ├── popper.min.js │ │ ├── bootstrap.min.js │ │ └── jquery.min.js ├── manage.py └── templates │ ├── ip_proxy │ ├── base.html │ └── list.html │ └── pagination.html ├── spider └── ip_proxies │ ├── ip_proxies │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ ├── xicidaili.py │ │ ├── kuaidaili.py │ │ ├── ip3366.py │ │ ├── jiangxianli.py │ │ ├── base.py │ │ └── verify.py │ ├── items.py │ ├── pipelines.py │ ├── middlewares.py │ └── settings.py │ ├── start_verify.py │ ├── scrapy.cfg │ ├── test.py │ └── start.py ├── docs ├── history.md ├── TODO_history.md └── Ubuntu_service.md ├── requirements.txt ├── rm_log.py ├── LICENSE ├── scheduler.py ├── .gitignore └── README.md /ip_proxy_site/ip_proxy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ip_proxy_site/ip_proxy_site/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spider/ip_proxies/ip_proxies/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ip_proxy_site/ip_proxy/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ip_proxy_site/ip_proxy/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /ip_proxy_site/static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LZC6244/ip_proxy_pool/HEAD/ip_proxy_site/static/favicon.ico -------------------------------------------------------------------------------- /ip_proxy_site/static/css/test_01.css: -------------------------------------------------------------------------------- 1 | .container { 2 | margin-top: 2%; 3 | } 4 | div.post { 5 | margin-bottom: 2%; 6 | } -------------------------------------------------------------------------------- /docs/history.md: -------------------------------------------------------------------------------- 1 | ## 2019.9.23 2 | 1. 
修复验证代理时不能验证到全部代理 3 | - 原因:`update` 和 `del` 操作会影响 `验证时间` ,而列表页是按照最新验证时间排序的。 4 | - 解决:获取完所有代理再进行代理验证。 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Scrapy==1.6.0 2 | psycopg2==2.8.3 3 | Django==2.2.13 4 | requests==2.21.0 5 | Twisted==20.3.0 6 | APScheduler==3.6.1 7 | -------------------------------------------------------------------------------- /ip_proxy_site/ip_proxy/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class IpProxyConfig(AppConfig): 5 | name = 'ip_proxy' 6 | -------------------------------------------------------------------------------- /spider/ip_proxies/ip_proxies/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spider/ip_proxies/start_verify.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: lzc 3 | # @Time : 2019/9/20 4 | from scrapy import cmdline 5 | 6 | cmdline.execute('scrapy crawl verify'.split()) 7 | # cmdline.execute('scrapy crawl ip3366'.split()) 8 | -------------------------------------------------------------------------------- /spider/ip_proxies/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = ip_proxies.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = ip_proxies 12 | -------------------------------------------------------------------------------- /ip_proxy_site/ip_proxy/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | from .models import IpProxy 5 | 6 | 7 | class IpProxyAdmin(admin.ModelAdmin): 8 | list_display = ['ip', 'port', 'anonymity', 'net_type', 'ip_location', 'available', 'count', 'verify_time'] 9 | list_filter = ['anonymity', 'net_type', 'available'] 10 | search_fields = ['ip'] 11 | 12 | 13 | admin.site.register(IpProxy, IpProxyAdmin) 14 | -------------------------------------------------------------------------------- /ip_proxy_site/ip_proxy_site/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for ip_proxy_site project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/2.2/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'ip_proxy_site.settings') 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /ip_proxy_site/ip_proxy/migrations/0003_ipproxy_count.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.2.10 on 2020-04-05 14:27 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('ip_proxy', '0002_ipproxy_available'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AddField( 14 | model_name='ipproxy', 15 | name='count', 16 | field=models.IntegerField(default=0), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /ip_proxy_site/ip_proxy/migrations/0002_ipproxy_available.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.2.10 on 2020-04-05 13:43 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('ip_proxy', '0001_initial'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AddField( 14 | model_name='ipproxy', 15 | name='available', 16 | field=models.BooleanField(default=None, null=True), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /ip_proxy_site/ip_proxy/urls.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: lzc 3 | # @Time :2019/8/26 4 | from django.conf.urls import url 5 | from django.urls import path 6 | from . 
import views 7 | 8 | app_name = 'ip_proxy' 9 | urlpatterns = [ 10 | path('list/', views.proxy_list, name='list_proxy'), 11 | path('get/', views.proxy_get, name='get_proxy'), 12 | path('update/', views.proxy_update, name='update_proxy'), 13 | path('del/', views.proxy_del, name='del_proxy'), 14 | path('get_csrf/', views.get_csrf, name='get_csrf'), 15 | ] 16 | -------------------------------------------------------------------------------- /spider/ip_proxies/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: lzc 3 | # @Time : 2019/9/10 4 | import requests 5 | 6 | session = requests.session() 7 | csrf = session.get('http://127.0.0.1:8000/ip_proxy/get_csrf/').json().get('csrf') 8 | formdata = { 9 | 'csrfmiddlewaretoken': csrf, 10 | 'ip': '163.204.247.41', 11 | 'port': '12345', 12 | 'net_type': 'HTTP', 13 | 'anonymity': '测试', 14 | 'ip_location': '测试', 15 | 'verify_time': '2019-09-10 11:36:11', 16 | } 17 | 18 | session.post('http://127.0.0.1:8000/ip_proxy/update/', formdata) 19 | -------------------------------------------------------------------------------- /docs/TODO_history.md: -------------------------------------------------------------------------------- 1 | # TODO History 2 | 3 | [TOC] 4 | 5 | --- 6 | ## 2019.8.29 7 | - [x] 增加 priority 字段,刚入库初始值为1,验证失败一次减2,小于1时删除该数据 8 | - [x] 增删查改之类的接口,弃用通过管道操作数据库方案 9 | 10 | --- 11 | ## 2019.9.5 12 | - [x] 创建爬取代理的爬虫的模板,代理爬虫爬取到代理后使用其与代理池服务器的数据库交互 13 | 14 | --- 15 | ## 2019.9.6 16 | - [x] 增加查看全部代理的列表页 17 | - [x] 修复使用爬虫模板时反复调用 parse 的问题(未指定callback) 18 | - [x] 增加验证代理可用性的程序 19 | - [x] 增加联动 `Scrapy` 和 `Django` 的调度程序 20 | 21 | --- 22 | ## 2019.9.26 23 | - [x] 增加代理源 [云代理](http://www.ip3366.net/free/?stype=1&page=1) 24 | - [x] 增加定时删除前几天的 `scrapy` `log` 的程序 25 | 26 | --- 27 | ## 2020.4.5 28 | - [ ] 增加一列表明上次是否验证成功 29 | - [ ] 优化优先级规则,第几次验证失败优先级就减少几次`(如第一次失败-1,第二次失败-2)` -------------------------------------------------------------------------------- /spider/ip_proxies/ip_proxies/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class IpProxiesItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field()['ip', 'port', 'anonymity', 'type', 'location'] 14 | 15 | ip = scrapy.Field() 16 | port = scrapy.Field() 17 | # 匿名度:高匿、普通(也只爬取这两种,透明类型的不要) 18 | anonymity = scrapy.Field() 19 | # HTTP OR HTTPS 20 | net_type = scrapy.Field() 21 | # 代理 IP 的位置 22 | ip_location = scrapy.Field() 23 | # 最后验证时间 24 | verify_time = scrapy.Field() 25 | -------------------------------------------------------------------------------- /ip_proxy_site/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Django's command-line utility for administrative tasks.""" 3 | import os 4 | import sys 5 | 6 | 7 | def main(): 8 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'ip_proxy_site.settings') 9 | try: 10 | from django.core.management import execute_from_command_line 11 | except ImportError as exc: 12 | raise ImportError( 13 | "Couldn't import Django. Are you sure it's installed and " 14 | "available on your PYTHONPATH environment variable? Did you " 15 | "forget to activate a virtual environment?" 
16 | ) from exc 17 | execute_from_command_line(sys.argv) 18 | 19 | 20 | if __name__ == '__main__': 21 | main() 22 | -------------------------------------------------------------------------------- /rm_log.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: lzc 3 | # @Time : 2019/9/26 4 | import os 5 | import re 6 | from datetime import datetime, timedelta 7 | 8 | 9 | def rm_log(path, time_format): 10 | if not os.path.exists(path): 11 | return 'path not exists.' 12 | # 当前日期 13 | date_now = datetime.now() 14 | file_li = os.listdir(path) 15 | for i in file_li: 16 | # 日志文件的日期 17 | date_log = re.findall('\d{4}-\d{2}-\d{2}T\d{2}_\d{2}_\d{2}', i) 18 | if not date_log: 19 | continue 20 | date_log = date_log[0] 21 | date_log = datetime.strptime(date_log, time_format) 22 | # 二者相差的天数 23 | day = (date_now - date_log).days 24 | # 删除 2 天前的日志文件 25 | if day >= 2: 26 | os.remove(os.path.join(path, i)) 27 | -------------------------------------------------------------------------------- /spider/ip_proxies/start.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: lzc 3 | # @Time :2019/8/23 4 | 5 | 6 | from scrapy import cmdline 7 | from scrapy.crawler import CrawlerProcess 8 | from scrapy.utils.project import get_project_settings 9 | from ip_proxies.spiders.kuaidaili import KuaidailiSpider 10 | from ip_proxies.spiders.jiangxianli import JiangxianliSpider 11 | from ip_proxies.spiders.xicidaili import XicidailiSpider 12 | from ip_proxies.spiders.ip3366 import Ip3366Spider 13 | 14 | # 测试爬虫时使用,默认注释掉 15 | # cmdline.execute('scrapy crawl xicidaili'.split()) 16 | 17 | # 在同一进程同时运行多个爬虫 18 | process = CrawlerProcess(get_project_settings()) 19 | process.crawl(KuaidailiSpider) 20 | process.crawl(JiangxianliSpider) 21 | process.crawl(XicidailiSpider) 22 | process.crawl(Ip3366Spider) 23 | # 脚本将会停在此处知道所有爬虫完成 24 | process.start() 25 | # # process.stop() 26 | -------------------------------------------------------------------------------- /ip_proxy_site/templates/ip_proxy/base.html: -------------------------------------------------------------------------------- 1 | {% load static %} 2 | 3 | 4 |
5 || IP | 9 |PORT | 10 |类型 | 11 |匿名度 | 12 |位置 | 13 |上次验证时间 | 14 |上次验证结果 | 15 |优先级 | 16 |
|---|---|---|---|---|---|---|---|
| {{ proxy.ip }} | 22 |{{ proxy.port }} | 23 |{{ proxy.net_type }} | 24 |{{ proxy.anonymity }} | 25 |{{ proxy.ip_location }} | 26 |{{ proxy.verify_time | date:'Y-m-d H:i:s' }} | 27 |{{ proxy.available }} | 28 |{{ proxy.priority }} | 29 |
共 {{ paginator.num_pages }} 页,每页 20 个代理
共 {{ paginator.count }} 个代理,其中上次验证可用共 {{ p_last_true_num }} 个
代理IP采集于网络,仅供学习交流
请勿用于非法途径,违者后果自负
ip:\t%s<br>\nport:\t%s<br>\nThe proxy does not exist.<br>' % (ip, port)) 143 | count = p[0].count 144 | priority = p[0].priority 145 | count += 1 146 | priority -= count 147 | if priority <= 0: 148 | p.delete() 149 | return HttpResponse( 150 | 'ip:\t%s<br>\nport:\t%s<br>\nThe proxy priority has been deleted.<br>' % (ip, port)) 151 | else: 152 | p.update(priority=priority, verify_time=verify_time, available=False, count=count) 153 | return HttpResponse( 154 | 'ip:\t%s<br>\nport:\t%s<br>\npriority(now):\t%s<br>The proxy priority has been reduced by ' 155 | 'one.<br>
' % ( 156 | ip, port, priority)) 157 | 158 | 159 | def get_csrf(request): 160 | # 从此 URL 获取 csrf 数据 161 | return JsonResponse(data={'csrf': get_token(request)}) 162 | -------------------------------------------------------------------------------- /spider/ip_proxies/ip_proxies/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | import requests 4 | from datetime import datetime 5 | from twisted.internet.error import TimeoutError 6 | from ip_proxies.settings import USER_AGENTS, GET_CSRF, GET_PROXY, DEL_PROXY, RETRY_TIMES, TIME_FORMAT 7 | # Define here the models for your spider middleware 8 | # 9 | # See documentation in: 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | from scrapy import signals 13 | 14 | session = requests.session() 15 | csrf = session.get(GET_CSRF).json().get('csrf') 16 | 17 | 18 | def get_proxy(): 19 | return requests.get(GET_PROXY).json() 20 | 21 | 22 | def del_proxy(ip, port, verify_time): 23 | return session.post(DEL_PROXY, {'csrfmiddlewaretoken': csrf, 'ip': ip, 'port': port, 'verify_time': verify_time}) 24 | 25 | 26 | class IpProxiesSpiderMiddleware(object): 27 | # Not all methods need to be defined. If a method is not defined, 28 | # scrapy acts as if the spider middleware does not modify the 29 | # passed objects. 30 | 31 | @classmethod 32 | def from_crawler(cls, crawler): 33 | # This method is used by Scrapy to create your spiders. 34 | s = cls() 35 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 36 | return s 37 | 38 | def process_spider_input(self, response, spider): 39 | # Called for each response that goes through the spider 40 | # middleware and into the spider. 41 | 42 | # Should return None or raise an exception. 43 | return None 44 | 45 | def process_spider_output(self, response, result, spider): 46 | # Called with the results returned from the Spider, after 47 | # it has processed the response. 48 | 49 | # Must return an iterable of Request, dict or Item objects. 50 | for i in result: 51 | yield i 52 | 53 | def process_spider_exception(self, response, exception, spider): 54 | # Called when a spider or process_spider_input() method 55 | # (from other spider middleware) raises an exception. 56 | 57 | # Should return either None or an iterable of Response, dict 58 | # or Item objects. 59 | pass 60 | 61 | def process_start_requests(self, start_requests, spider): 62 | # Called with the start requests of the spider, and works 63 | # similarly to the process_spider_output() method, except 64 | # that it doesn’t have a response associated. 65 | 66 | # Must return only requests (not items). 67 | for r in start_requests: 68 | yield r 69 | 70 | def spider_opened(self, spider): 71 | spider.logger.info('Spider opened: %s' % spider.name) 72 | 73 | 74 | class IpProxiesDownloaderMiddleware(object): 75 | # Not all methods need to be defined. If a method is not defined, 76 | # scrapy acts as if the downloader middleware does not modify the 77 | # passed objects. 78 | 79 | @classmethod 80 | def from_crawler(cls, crawler): 81 | # This method is used by Scrapy to create your spiders. 82 | s = cls() 83 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 84 | return s 85 | 86 | def process_request(self, request, spider): 87 | # Called for each request that goes through the downloader 88 | # middleware. 
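# A pooled proxy could be attached at this point before the request goes out.
# A minimal sketch (not active in this project), mirroring the commented-out
# ManageProxy.process_request further below and reusing the get_proxy() helper
# and the TIME_FORMAT setting already imported in this module:
#
#     if not (request.meta.get('retry_times') or request.meta.get('proxy')):
#         proxy_json = get_proxy()
#         request.meta['proxy'] = proxy_json.get('proxy')
#         request.meta['ip_proxy'] = proxy_json.get('ip')
#         request.meta['port_proxy'] = proxy_json.get('port')
#         request.meta['verify_time'] = datetime.strftime(datetime.now(), TIME_FORMAT)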
89 | 90 | # Must either: 91 | # - return None: continue processing this request 92 | # - or return a Response object 93 | # - or return a Request object 94 | # - or raise IgnoreRequest: process_exception() methods of 95 | # installed downloader middleware will be called 96 | return None 97 | 98 | def process_response(self, request, response, spider): 99 | # Called with the response returned from the downloader. 100 | 101 | # Must either; 102 | # - return a Response object 103 | # - return a Request object 104 | # - or raise IgnoreRequest 105 | return response 106 | 107 | def process_exception(self, request, exception, spider): 108 | # Called when a download handler or a process_request() 109 | # (from other downloader middleware) raises an exception. 110 | 111 | # Must either: 112 | # - return None: continue processing this exception 113 | # - return a Response object: stops process_exception() chain 114 | # - return a Request object: stops process_exception() chain 115 | pass 116 | 117 | def spider_opened(self, spider): 118 | spider.logger.info('Spider opened: %s' % spider.name) 119 | 120 | 121 | class RandomUA(object): 122 | 123 | def process_request(self, request, spider): 124 | # print('*' * 50) 125 | 126 | user_agent = random.choice(USER_AGENTS) 127 | # print(user_agent) 128 | request.headers['User-Agent'] = user_agent 129 | 130 | 131 | class ManageProxy(object): 132 | # def process_request(self, request, spider): 133 | # if request.meta.get('retry_times') or request.meta.get('proxy'): 134 | # # request.meta['proxy'] = request.meta['proxy'] 135 | # pass 136 | # else: 137 | # proxy_json = get_proxy() 138 | # request.meta['proxy'] = proxy_json.get('proxy') 139 | # request.meta['ip_proxy'] = proxy_json.get('ip') 140 | # request.meta['port_proxy'] = proxy_json.get('port') 141 | # request.meta['verify_time'] = datetime.strftime(datetime.now(), TIME_FORMAT) 142 | 143 | def process_exception(self, request, exception, spider): 144 | # if all([isinstance(exception, TimeoutError), 145 | # request.meta.get('retry_times') == RETRY_TIMES]): 146 | if request.meta.get('retry_times') == RETRY_TIMES: 147 | item = request.meta['item'] 148 | verify_time = datetime.strftime(datetime.now(), TIME_FORMAT) 149 | del_proxy(item['ip'], item['port'], verify_time) 150 | # return request.copy() 151 | -------------------------------------------------------------------------------- /spider/ip_proxies/ip_proxies/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | from datetime import datetime 4 | 5 | # Scrapy settings for ip_proxies project 6 | # 7 | # For simplicity, this file contains only settings considered important or 8 | # commonly used. 
You can find more settings consulting the documentation: 9 | # 10 | # https://doc.scrapy.org/en/latest/topics/settings.html 11 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 12 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 13 | 14 | BOT_NAME = 'ip_proxies' 15 | 16 | SPIDER_MODULES = ['ip_proxies.spiders'] 17 | NEWSPIDER_MODULE = 'ip_proxies.spiders' 18 | 19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 20 | # USER_AGENT = 'ip_proxies (+http://www.yourdomain.com)' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | # CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 1.5 32 | # The download delay setting will honor only one of: 33 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | # CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | # COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | # TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | # DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | # } 47 | 48 | # Enable or disable spider middlewares 49 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 50 | # SPIDER_MIDDLEWARES = { 51 | # 'ip_proxies.middlewares.IpProxiesSpiderMiddleware': 543, 52 | # } 53 | 54 | # Enable or disable downloader middlewares 55 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 56 | DOWNLOADER_MIDDLEWARES = { 57 | # 'ip_proxies.middlewares.IpProxiesDownloaderMiddleware': 543, 58 | 'ip_proxies.middlewares.RandomUA': 543, 59 | } 60 | 61 | # Enable or disable extensions 62 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 63 | # EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | # } 66 | 67 | # Configure item pipelines 68 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 69 | # ITEM_PIPELINES = { 70 | # 'ip_proxies.pipelines.IpProxiesPipeline': 300, 71 | # } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | # AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | # AUTOTHROTTLE_START_DELAY = 3 78 | # The maximum download delay to be set in case of high latencies 79 | # AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | # AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | # HTTPCACHE_ENABLED = True 89 | # HTTPCACHE_EXPIRATION_SECS = 0 90 | # HTTPCACHE_DIR = 'httpcache' 91 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | 94 | # 重试次数设置 (默认为2) 95 | RETRY_TIMES = 2 96 | # 超时时间设为10 (默认180) 97 | DOWNLOAD_TIMEOUT = 10 98 | 99 | TEST_URLS = [ 100 | 'https://www.baidu.com', 101 | 
'https://cn.bing.com/', 102 | 'https://www.so.com/', 103 | ] 104 | 105 | # 日志文件设置 106 | LOG_LEVEL = 'DEBUG' 107 | # LOG_LEVEL = 'WARNING' 108 | LOG_ENCODING = 'utf-8' 109 | DATE = datetime.now() 110 | log_path = os.path.normpath(os.path.join(os.path.dirname(__file__), '../log')) 111 | os.makedirs(log_path, exist_ok=True) 112 | TIME_FORMAT = '%Y-%m-%dT%H_%M_%S' 113 | LOG_FILE = f'log/{DATE.strftime(TIME_FORMAT)}.log' 114 | 115 | # 不需要去重 2019.8.23 116 | # # 使用scrapy-redis里的去重组件,不使用scrapy默认的去重 117 | # DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 118 | # # 使用scrapy-redis里的调度器组件,不实用scrapy默认的调度器 119 | # SCHEDULER = "scrapy_redis.scheduler.Scheduler" 120 | # # 允许暂停,redis请求记录不丢失 121 | # SCHEDULER_PERSIST = True 122 | # REDIS_HOST = '127.0.0.1' 123 | # # REDIS_HOST = '139.9.58.217' 124 | # REDIS_PORT = 6379 125 | # REDIS_PARAMS = {'password': 'xxxxx'} 126 | 127 | 128 | USER_AGENTS = [ 129 | 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 ' 130 | 'Safari/534.50', 131 | 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', 132 | 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0', 133 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)', 134 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)', 135 | 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1', 136 | 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36', 137 | 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11', 138 | 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11', 139 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)', 140 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36', 141 | 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko' 142 | ] 143 | 144 | # IP_PROXY 相关 URL 配置 145 | GET_CSRF = 'http://127.0.0.1:8000/ip_proxy/get_csrf/' 146 | LIST_PROXY = 'http://127.0.0.1:8000/ip_proxy/list/?page=1' 147 | GET_PROXY = 'http://127.0.0.1:8000/ip_proxy/get/' 148 | UPDATE_PROXY = 'http://127.0.0.1:8000/ip_proxy/update/' 149 | DEL_PROXY = 'http://127.0.0.1:8000/ip_proxy/del/' 150 | -------------------------------------------------------------------------------- /ip_proxy_site/static/js/popper.min.js: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) Federico Zivolo 2017 3 | Distributed under the MIT License (license terms are at http://opensource.org/licenses/MIT). 
4 | */(function(e,t){'object'==typeof exports&&'undefined'!=typeof module?module.exports=t():'function'==typeof define&&define.amd?define(t):e.Popper=t()})(this,function(){'use strict';function e(e){return e&&'[object Function]'==={}.toString.call(e)}function t(e,t){if(1!==e.nodeType)return[];var o=window.getComputedStyle(e,null);return t?o[t]:o}function o(e){return'HTML'===e.nodeName?e:e.parentNode||e.host}function n(e){if(!e||-1!==['HTML','BODY','#document'].indexOf(e.nodeName))return window.document.body;var i=t(e),r=i.overflow,p=i.overflowX,s=i.overflowY;return /(auto|scroll)/.test(r+s+p)?e:n(o(e))}function r(e){var o=e&&e.offsetParent,i=o&&o.nodeName;return i&&'BODY'!==i&&'HTML'!==i?-1!==['TD','TABLE'].indexOf(o.nodeName)&&'static'===t(o,'position')?r(o):o:window.document.documentElement}function p(e){var t=e.nodeName;return'BODY'!==t&&('HTML'===t||r(e.firstElementChild)===e)}function s(e){return null===e.parentNode?e:s(e.parentNode)}function d(e,t){if(!e||!e.nodeType||!t||!t.nodeType)return window.document.documentElement;var o=e.compareDocumentPosition(t)&Node.DOCUMENT_POSITION_FOLLOWING,i=o?e:t,n=o?t:e,a=document.createRange();a.setStart(i,0),a.setEnd(n,0);var l=a.commonAncestorContainer;if(e!==l&&t!==l||i.contains(n))return p(l)?l:r(l);var f=s(e);return f.host?d(f.host,t):d(e,s(t).host)}function a(e){var t=1p[c]&&(e.offsets.popper[f]+=s[f]+g-p[c]);var u=s[f]+s[a]/2-g/2,b=t(e.instance.popper,'margin'+l).replace('px',''),y=u-h(e.offsets.popper)[f]-b;return y=X(V(p[a]-g,y),0),e.arrowElement=i,e.offsets.arrow={},e.offsets.arrow[f]=Math.round(y),e.offsets.arrow[m]='',e},element:'[x-arrow]'},flip:{order:600,enabled:!0,fn:function(e,t){if(W(e.instance.modifiers,'inner'))return e;if(e.flipped&&e.placement===e.originalPlacement)return e;var o=w(e.instance.popper,e.instance.reference,t.padding,t.boundariesElement),i=e.placement.split('-')[0],n=L(i),r=e.placement.split('-')[1]||'',p=[];switch(t.behavior){case fe.FLIP:p=[i,n];break;case fe.CLOCKWISE:p=K(i);break;case fe.COUNTERCLOCKWISE:p=K(i,!0);break;default:p=t.behavior;}return p.forEach(function(s,d){if(i!==s||p.length===d+1)return e;i=e.placement.split('-')[0],n=L(i);var a=e.offsets.popper,l=e.offsets.reference,f=_,m='left'===i&&f(a.right)>f(l.left)||'right'===i&&f(a.left)","
"],col:[2,"
"],tr:[2,"","
"],td:[3,"
"],_default:[0,"",""]};ma.optgroup=ma.option,ma.tbody=ma.tfoot=ma.colgroup=ma.caption=ma.thead,ma.th=ma.td;function na(a,b){var c;return c="undefined"!=typeof a.getElementsByTagName?a.getElementsByTagName(b||"*"):"undefined"!=typeof a.querySelectorAll?a.querySelectorAll(b||"*"):[],void 0===b||b&&B(a,b)?r.merge([a],c):c}function oa(a,b){for(var c=0,d=a.length;c","