├── README.md ├── Web ├── class │ └── search_class.php ├── index.html ├── css │ ├── search.css │ └── index.css └── search.php ├── LICENSE ├── .gitignore ├── sql.py ├── movie.py └── Proxy-IP_1.0.0b3.py /README.md: -------------------------------------------------------------------------------- 1 | # douban-movie-crawler 2 | 豆瓣电影爬虫Demo 3 | -------------------------------------------------------------------------------- /Web/class/search_class.php: -------------------------------------------------------------------------------- 1 | actor_list = implode('/ ',$temp[0]); 11 | return $this->actor_list; 12 | } 13 | } 14 | ?> -------------------------------------------------------------------------------- /Web/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 电影搜索 6 | 7 | 8 | 9 |

10 |

17 |

18 |

19 |

电影搜索

20 | 24 |

25 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 老蘑菇 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Web/css/search.css: -------------------------------------------------------------------------------- 1 | * { 2 | margin: 0; 3 | padding: 0; 4 | } 5 | 6 | a { 7 | text-decoration: none; 8 | } 9 | 10 | /* 整个搜索盒子 */ 11 | .video { 12 | width: 600px; 13 | height: 200px; 14 | margin: 40px 0; 15 | margin-left: 50px; 16 | } 17 | 18 | /* 影视图片 */ 19 | .video .img { 20 | float: left; 21 | height: 100%; 22 | width: 150px; 23 | } 24 | 25 | /* 影视信息 */ 26 | .info { 27 | float: left; 28 | width: 420px; 29 | margin-left: 30px; 30 | } 31 | 32 | .info h3 { 33 | display: -webkit-box; 34 | width: inherit; 35 | overflow: hidden; 36 | text-overflow: ellipsis; 37 | -webkit-line-clamp: 1; 38 | -webkit-box-orient: vertical; 39 | } 40 | 41 | .info h3 a:hover { 42 | color: #ffaa00; 43 | } 44 | 45 | .info h3 span { 46 | color: #000080; 47 | font-size: 10px; 48 | } 49 | 50 | .info .actor { 51 | display: -webkit-box; 52 | width: inherit; 53 | overflow: hidden; 54 | text-overflow: ellipsis; 55 | -webkit-line-clamp: 1; 56 | -webkit-box-orient: vertical; 57 | margin-top: 10px; 58 | } 59 | 60 | .info .description p { 61 | display: -webkit-box; 62 | width: inherit; 63 | overflow: hidden; 64 | text-overflow: ellipsis; 65 | -webkit-line-clamp: 4; 66 | -webkit-box-orient: vertical; 67 | margin-top: 10px; 68 | } 69 | 70 | .info .more { 71 | display: block; 72 | margin-top: 10px; 73 | } 74 | -------------------------------------------------------------------------------- /Web/css/index.css: -------------------------------------------------------------------------------- 1 | * { 2 | margin: 0; 3 | padding: 0; 4 | } 5 | 6 | li { 7 | list-style: none; 8 | } 9 | 10 | a { 11 | text-decoration: none; 12 | color: #FFFFFF; 13 | } 14 | 15 | body { 16 | background-image: url(http://api.ixiaowai.cn/gqapi/gqapi.php); 17 | background-repeat: no-repeat; 18 | background-attachment: fixed; 19 | background-position: center top; 20 | font-family: "'Microsoft YaHei', arial, helvetica, sans-serif"; 21 | } 22 | 23 | /* 导航栏 */ 24 | .nav { 25 | width: 100%; 26 | height: 60px; 27 | text-align: center; 28 | transition: all .5s; 29 | } 30 | 31 | .nav:hover { 32 | background-color: rgba(255, 255, 255, 0.6); 33 | } 34 | 35 | .nav ul { 36 | display: inline-block; 37 | } 38 | 39 | .nav ul li { 40 | float: left; 41 | } 42 | 43 | .nav ul li a { 44 | color: #000000; 45 | line-height: 60px; 46 | padding: 0 25px; 47 | } 48 | 49 | .nav ul li a:hover { 50 | color: #ff0000; 51 | } 52 | 53 | /* 搜索框 */ 54 | .search { 55 | width: 100%; 56 | margin-top: 200px; 57 | text-align: center; 58 | } 59 | 60 | .search h1 { 61 | margin: 100px 0; 62 | color: #fff; 63 | } 64 | 65 | .search form { 66 | margin: 0 auto; 67 | } 68 | 69 | .search .text { 70 | width: 320px; 71 | height: 30px; 72 | } 73 | 74 | .search .button { 75 | width: 80px; 76 | height: 32px; 77 | margin-left: 10px; 78 | border: 0px; 79 | border-radius: 10px; 80 | transition: all .3s; 81 | } 82 | 83 | .search .button:hover { 84 | background-color: #ff5500; 85 | } 86 | 87 | .search .button:active { 88 | transform: translateY(3px); 89 | } 90 | 91 | /* 底部 */ 92 | .footer { 93 | width: 100%; 94 | color: #fff; 95 | margin-top: 20%; 96 | text-align: center; 97 | } 98 | 99 | .footer span a:hover { 100 | color: #ffaa00; 101 | } 102 | -------------------------------------------------------------------------------- /Web/search.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 电影搜索 5 | 6 | 7 | 8 | 9 | 共搜索到". count($row) ."个与【". $keywords ."】有关的内容

"; 28 | for($i=0; $i

". $row[$i][2] ." ". $row[$i][5] ."

". $info->actor($row[$i][9]) ."

". $row[$i][6] ."

查看详情

"; 30 | } 31 | } else { 32 | echo "

没有找到相关结果

"; 33 | } 34 | //释放结果集 35 | mysqli_free_result($result); 36 | 37 | //断开服务器连接 38 | mysqli_close($con); 39 | ?> 40 | 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /sql.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | import mysql.connector 5 | 6 | host = "localhost" # 数据库主机地址 7 | user = "root" # 数据库用户名 8 | passwd = "root" # 数据库密码 9 | database = "movie" # 数据库名 10 | 11 | 12 | # 创建数据库 13 | def create_db(): 14 | db = mysql.connector.connect(host=host, user=user, passwd=passwd) # 打开数据库连接 15 | cursor = db.cursor() # 创建游标对象 16 | try: 17 | cursor.execute("CREATE DATABASE `movie`") # 执行MYSQL语句 18 | print("数据库创建成功！") 19 | except Exception: 20 | print("【movie】数据库已存在，跳过创建！") 21 | db.close() # 关闭数据库连接 22 | 23 | 24 | # 创建数据表 25 | def create_table(): 26 | db = mysql.connector.connect(host=host, user=user, passwd=passwd, database=database) 27 | cursor = db.cursor() 28 | try: 29 | cursor.execute("CREATE TABLE `movie` ( `id` INT NOT NULL AUTO_INCREMENT,\ 30 | `db_id` INT(20) NOT NULL, `title` VARCHAR(100) NOT NULL , `rate` FLOAT NOT NULL , `img` VARCHAR(100) NOT NULL ,\ 31 | `data` TINYTEXT NOT NULL , `description` TEXT NOT NULL , `director` TEXT NOT NULL ,\ 32 | `author` TEXT NOT NULL , `actor` TEXT NOT NULL , `genre` VARCHAR(255) NOT NULL ,\ 33 | PRIMARY KEY (`id`)) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_520_ci") 34 | print("表创建成功！") 35 | except Exception: 36 | print("表已存在，跳过创建！") 37 | db.close() # 关闭数据库连接 38 | 39 | 40 | # 插入电影数据 ID、标题、评分、图链 41 | def insert_data(id, db_id, title, rate, img): 42 | db = mysql.connector.connect(host=host, user=user, passwd=passwd, database=database) 43 | cursor = db.cursor() 44 | insert = "INSERT INTO `movie` (`id`, `db_id`, `title`, `rate`, `img`, `data`, `description`,\ 45 | `director`, `author`, `actor`, `genre`) VALUES ('" + id + "', '" + db_id + "', '" + title + "', '" + rate +\ 46 | "', '" + img + "', '', '', '', '', '', '')" 47 | # print(insert) # 查看数据库插入语句 48 | cursor.execute(insert) 49 | db.commit() # 提交任务，数据才会写入数据库 50 | db.close() # 关闭数据库连接 51 | 52 | 53 | # 插入电影详情日期、简介、导演、作者、演员、类型 54 | def insert_info(id, data, description, director, author, actor, genre): 55 | db = mysql.connector.connect(host=host, user=user, passwd=passwd, database=database) 56 | cursor = db.cursor() 57 | insert = "UPDATE `movie` SET `data` = '" + data + "', `description` = '" + description + "', `director` = '"\ 58 | + director + "', `author` = '" + author + "', `actor` = '" + actor + "', `genre` = '"\ 59 | + genre + "' WHERE `movie`.`id` = " + id + ";" 60 | print(insert) # 查看数据库插入语句 61 | cursor.execute(insert) 62 | db.commit() # 提交任务，数据才会写入数据库 63 | db.close() # 关闭数据库连接 64 | 65 | 66 | if __name__ == '__main__': 67 | create_db() 68 | create_table() -------------------------------------------------------------------------------- /movie.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | import re 5 | import json 6 | import time 7 | import requests 8 | from fake_useragent import UserAgent 9 | import sql 10 | 11 | # User-Agent池 12 | ua = UserAgent() 13 | headers = {'User-Agent': ua.chrome} 14 | 15 | # 代理IP池(有时效性) 16 | proxies = ['49.86.75.105:39187', '110.86.176.186:44587', '27.152.195.115:29933', '1.194.148.210:30077', 17 | '123.54.231.164:48890', '106.122.169.251:48471', '100.86.35.31:47038', '49.85.111.243:23859', 18 | '183.166.125.46:35662', '49.85.105.104:25124', '122.241.218.254:20164', '220.185.209.203:45233', 19 | '125.109.195.79:22384', '125.123.64.194:21925', '117.93.181.43:44142', '180.125.70.163:42062'] 20 | 21 | # 全局变量 22 | times = 0 23 | 24 | 25 | # 豆瓣电影分类函数(tag) 26 | def douban_tags(): 27 | response = requests.get("https://movie.douban.com/j/search_tags", headers=headers) 28 | tags = json.loads(response.text) 29 | return tags["tags"] # 返回豆瓣电影分类列表 30 | 31 | 32 | # 豆瓣电影信息函数(ID、标题、评分、图链) 33 | def douban_movie(mv_tag, mv_num): 34 | temp = [] 35 | url = "https://movie.douban.com/j/search_subjects?type=movie&tag=" + str(mv_tag) + "&sort=recommend&page_limit=" + \ 36 | str(mv_num) + "&page_start=0" 37 | response = requests.get(url, headers=headers, timeout=10) 38 | movie_json = json.loads(response.text) 39 | for info in movie_json["subjects"]: # 遍历豆瓣JSON信息得到字典数据 40 | temp.append([info["id"], info["title"], info["rate"], info["cover"]]) 41 | return temp # 返回各分类所有电影的信息数组 42 | 43 | 44 | # 豆瓣电影详情函数(日期、简介、导演、作者、演员、类型) 45 | def douban_info(id, mv_id, you_ip): 46 | url = "https://movie.douban.com/subject/" + str(mv_id) 47 | proxy_host = you_ip # 代理IP 48 | response = requests.get(url, headers=headers, proxies={"http": proxy_host, "https": proxy_host}, timeout=10) 49 | movie_json = re.search(r'application/ld\+json">([\s\S]*?)', '').replace('标签内):网页链接、备注名称、爬取页数、爬取速度 48 | def ip_html1(link, text, page, speed): 49 | global proxy_ip 50 | ip_num = 0 51 | try: 52 | for a in range(1, page): # 爬取多少页 53 | url = str(link) + str(a) 54 | response = requests.get(url, headers=headers, timeout=8) 55 | # print(response.text) 56 | soupIP = BeautifulSoup(response.text, 'html5lib') 57 | trs = soupIP.find_all('tr') 58 | for tr in trs[1:]: 59 | ip_num += 1 # 自增计算IP数 60 | tds = tr.find_all('td') 61 | # print(tds) 62 | ip = tds[0].text.strip() 63 | proxy_ip.append(ip) 64 | time.sleep(speed) # 控制访问速度(很重要，如果访问太快被封IP就不能继续爬了) 65 | except Exception as e: 66 | print(e) 67 | print(str(text) + "获取到" + str(ip_num) + "个代理IP") 68 | 69 | 70 | # 使用HTML解析获取代理IP函数【二】(源码IP和端口不在一起,IP在第1个标签内,端口在第2个标签内):网页链接、备注名称、爬取页数、爬取速度 71 | def ip_html2(link, text, page, speed): 72 | global proxy_ip 73 | ip_num = 0 74 | try: 75 | for a in range(1, page): # 爬取多少页 76 | url = str(link) + str(a) 77 | response = requests.get(url, headers=headers, timeout=8) 78 | # print(response.text) 79 | soupIP = BeautifulSoup(response.text, 'html5lib') 80 | trs = soupIP.find_all('tr') 81 | for tr in trs[1:]: 82 | ip_num += 1 # 自增计算IP数 83 | tds = tr.find_all('td') 84 | # print(tds) 85 | ip = tds[0].text.strip() 86 | port = tds[1].text.strip() 87 | proxy_ip.append(ip + ':' + port) 88 | time.sleep(speed) # 控制访问速度(很重要，如果访问太快被封IP就不能继续爬了) 89 | except Exception as e: 90 | print(e) 91 | print(str(text) + "获取到" + str(ip_num) + "个代理IP") 92 | 93 | 94 | # 使用HTML解析获取代理IP函数【三】(源码IP和端口不在一起,IP在第2个标签内,端口在第3个标签内):网页链接、备注名称、爬取页数、爬取速度 95 | def ip_html3(link, text, page, speed): 96 | global proxy_ip 97 | ip_num = 0 98 | try: 99 | for a in range(1, page): # 爬取多少页 100 | url = str(link) + str(a) 101 | response = requests.get(url, headers=headers, timeout=8) 102 | # print(response.text) 103 | soupIP = BeautifulSoup(response.text, 'html5lib') 104 | trs = soupIP.find_all('tr') 105 | for tr in trs[1:]: 106 | ip_num += 1 # 自增计算IP数 107 | tds = tr.find_all('td') 108 | # print(tds) 109 | ip = tds[1].text.strip() 110 | port = tds[2].text.strip() 111 | proxy_ip.append(ip + ':' + port) 112 | time.sleep(speed) # 控制访问速度(很重要，如果访问太快被封IP就不能继续爬了) 113 | except Exception as e: 114 | print(e) 115 | print(str(text) + "获取到" + str(ip_num) + "个代理IP") 116 | 117 | 118 | # 获取普通代理IP函数 119 | def get_ip(): 120 | # 定义一个获取IP的线程池，如果你有其他接口可以往里加 121 | threads_ip = [MyThread(ip_api, args=('http://www.66ip.cn/mo.php?sxb=&tqsl=7000&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=', '66免费代理')), 122 | MyThread(ip_html3, args=('https://www.xicidaili.com/nt/', '西刺代理', 5, 6)), 123 | MyThread(ip_html2, args=('https://www.kuaidaili.com/free/intr/', '快代理', 20, 2)), 124 | MyThread(ip_html2, args=('http://www.ip3366.net/free/?stype=2&page=', '云代理', 7, 1)), 125 | MyThread(ip_api, args=('http://www.89ip.cn/tqdl.html?api=1&num=3000&port=&address=&isp=', '89免费代理（不知道是不是高匿）')), 126 | MyThread(ip_html1, args=('http://www.nimadaili.com/putong/', '泥马代理', 100, 1)), 127 | MyThread(ip_html1, args=('http://www.xiladaili.com/putong/', '西拉代理', 100, 1))] 128 | for b in threads_ip: 129 | b.start() 130 | for b in threads_ip: 131 | b.join() 132 | # 推荐个小幻代理(防爬但有手动提取接口):https://ip.ihuan.me/ 133 | 134 | 135 | # 获取匿名代理IP函数 136 | def get_anonymous_ip(): 137 | # 定义一个获取IP的线程池，如果你有其他接口可以往里加 138 | threads_ip = [MyThread(ip_api, args=('http://www.66ip.cn/nmtq.php?getnum=3000&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=2&api=66ip', '66免费代理')), 139 | MyThread(ip_html3, args=('https://www.xicidaili.com/nn/', '西刺代理', 5, 6)), 140 | MyThread(ip_html2, args=('https://www.kuaidaili.com/free/inha/', '快代理', 20, 2)), 141 | MyThread(ip_html2, args=('http://www.ip3366.net/free/?stype=1&page=', '云代理', 7, 1)), 142 | MyThread(ip_api, args=('http://www.89ip.cn/tqdl.html?api=1&num=3000&port=&address=&isp=', '89免费代理（不知道是不是高匿）')), 143 | MyThread(ip_html1, args=('http://www.nimadaili.com/gaoni/', '泥马代理', 100, 1)), 144 | MyThread(ip_html1, args=('http://www.xiladaili.com/gaoni/', '西拉代理', 100, 1)), 145 | MyThread(ip_html2, args=('https://www.7yip.cn/free/?action=china&page=', '齐云代理', 90, 1)), 146 | MyThread(ip_html2, args=('https://ip.jiangxianli.com/?page=', '高可用全球免费代理库', 8, 0))] 147 | for b in threads_ip: 148 | b.start() 149 | for b in threads_ip: 150 | b.join() 151 | # 推荐个小幻代理(防爬但有手动提取接口):https://ip.ihuan.me/ 152 | 153 | 154 | # 验证输入正确函数 155 | def check_input(site, word, code): 156 | check = requests.get(site, headers=headers, timeout=10) 157 | if code == "2": 158 | check.encoding = 'gbk' 159 | else: 160 | check.encoding = 'utf-8' 161 | if word in check.text: # 判断关键字符串是否在网站源码中 162 | print("\n验证成功，验证网址和关键字符串可用，开始代理IP验证！") 163 | return True 164 | else: 165 | print("\n验证失败，验证网址和关键字符串不可用，请重新输入！") 166 | return False 167 | 168 | 169 | # 验证代理函数 170 | def check_ip(ip, site, word, code): 171 | global proxy_ok_ip 172 | try: 173 | proxy_temp = {"http": ip, "https": ip} 174 | res = requests.get(site, headers=headers, proxies=proxy_temp, timeout=10) # 验证超时时间，默认10秒 175 | if code == "2": 176 | res.encoding = 'gbk' 177 | else: 178 | res.encoding = 'utf-8' 179 | if word in res.text: # 判断关键词是否在网站源码中 180 | print(res, ip + " is OK") 181 | proxy_ok_ip.append(ip) 182 | else: 183 | print(ip + " is BOOM") 184 | except: # 超时或异常 185 | print(ip + " is BOOM") 186 | 187 | 188 | # 列表写入TXT文件函数：filename为写入TXT文件的路径，data为要写入数据列表 189 | def text_save(filename, data): 190 | file = open(filename, 'w+') 191 | for i in range(len(data)): 192 | s = str(data[i]).replace('[', '').replace(']', '') # 去除[],这两行按数据不同，可以选择 193 | s = s.replace("'", '').replace(',', '') + '\n' # 去除单引号，逗号，每行末尾追加换行符 194 | file.write(s) 195 | file.close() 196 | print("代理IP信息保存文件成功，请查看当前运行目录！") 197 | 198 | 199 | # 列表去重函数 200 | def check_list(lists): 201 | temp = [] 202 | for i in lists: 203 | if not i in temp: 204 | temp.append(i) 205 | return temp 206 | 207 | 208 | if __name__ == '__main__': 209 | print("注意：此脚本为提取代理IP脚本，可以获取大量免费代理IP！\n") 210 | input_ip_type = input("请选择你要获取的代理IP类型(1.普通(默认) 2.高匿)：") 211 | if input_ip_type == '2': 212 | print("\n正在获取[高匿]代理IP中请稍等片刻，大概3min... _(:з」∠)_") 213 | get_anonymous_ip() 214 | ip_list = check_list(proxy_ip) # 去重 215 | else: 216 | print("\n正在获取[普通]代理IP中请稍等片刻，大概3min... _(:з」∠)_") 217 | get_ip() 218 | ip_list = check_list(proxy_ip) # 去重 219 | 220 | # 是否验证可用性？ 221 | input_check = input("\n去重后共获取到" + str(len(ip_list)) + "个代理IP，是否验证可用性（默认否，输入任意字符开始验证）：") 222 | if input_check: 223 | input_thread_num = input("请输入验证的线程数（默认500）：") 224 | else: 225 | input_thread_num = 500 226 | 227 | if input_check: # 验证可用性✔ 228 | while True: # 首次验证检测输入是否正确，不使用代理IP 229 | input_site = input("请输入要访问的站点：") 230 | input_code = input("请输入验证站点的编码格式(1.UTF-8(默认) 2.GBK)：") 231 | input_word = input("请输入验证站点内的关键字符串，如 “https://www.baidu.com” 中有关键字符串 “百度一下” ：") 232 | if check_input(input_site, input_word, input_code): 233 | break 234 | else: 235 | print('\n\n') 236 | 237 | # 多线程验证开始，GO! GO! GO! 238 | k = 0 239 | threads = [] # 定义一个线程池 240 | thread_count = len(ip_list) // int(input_thread_num) + 1 # 分几段线程 241 | for i in range(thread_count): 242 | for j in range(int(input_thread_num)): # 一段几线程 243 | try: 244 | # 创建新线程,添加到线程池 245 | threads.append(MyThread(check_ip, args=(ip_list[k], input_site, input_word, input_code))) 246 | k += 1 247 | except: 248 | break 249 | # 开启所有线程 250 | for t in threads: 251 | t.start() 252 | # 等待所有线程完成 253 | for t in threads: 254 | t.join() 255 | print("可用IP总数为" + str(len(proxy_ok_ip)) + "个") 256 | text_save("验证过的IP列表.txt", proxy_ok_ip) 257 | 258 | else: # 不验证可用性× 259 | # print(ip_list) 260 | text_save("未验证的IP列表.txt", ip_list) 261 | input("按任意键即可退出！") --------------------------------------------------------------------------------