├── 1.png
├── 2.png
├── README.md
├── config.py
├── db
│   └── dbserver.py
├── keywords.txt
├── main_tools.py
├── module
│   └── Engine
│       ├── bing
│       │   ├── bing.py
│       │   └── bingInternational.py
│       └── google
│           └── google.py
├── requirements.txt
└── sql.sql

/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/midisec/UrlCollectionTools/5d2cce059ba52804ff249f9a90ceb2c88aacf9ff/1.png
--------------------------------------------------------------------------------
/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/midisec/UrlCollectionTools/5d2cce059ba52804ff249f9a90ceb2c88aacf9ff/2.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# [UrlCollectionTools](https://github.com/midisec/UrlCollectionTools)

**A multi-threaded, multi-engine tool that collects URLs concurrently from search engines by keyword. It supports multiple keywords and deduplicates results automatically.**

## Supported search engines

| Search engine        | Multi-threaded | Date       |
| -------------------- | -------------- | ---------- |
| Bing (China)         | √              | 2022-11-22 |
| Bing (International) | √              | 2022-11-22 |

## Quick start

Clone the project:

```
git clone https://github.com/midisec/UrlCollectionTools
```

Install the dependencies under a Python 3 environment:

```bash
pip3 install -r requirements.txt
```

Create a MySQL database and import the sql.sql file into it.

Edit the database connection settings in ./db/dbserver.py.

### Database schema

| Database | Table      | Field 1                     | Field 2        |
| -------- | ---------- | --------------------------- | -------------- |
| url      | url_tables | id (int, primary key, auto) | url (tinytext) |

Put your keywords in keywords.txt, one per line.

Start the tool:

```bash
python3 main_tools.py
```

## Preview

On a 2H2G4M server (2 cores, 2 GB RAM, 4 Mbps bandwidth), roughly 80,000 URLs in about ten hours.

## Changelog

2022-03-22

* Restructured the project
* Added the Bing China and Bing International engines
* Added multi-threading; several engines can collect concurrently

2022-11-22

* The Bing engines now collect each keyword in its own thread, improving throughput
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
# requests only honors the "http" / "https" scheme keys below; a SOCKS proxy
# would instead use a value such as "socks5://127.0.0.1:7890" and require the
# requests[socks] extra.
PROXIES = {
    "https": "http://127.0.0.1:7890",
    "http": "http://127.0.0.1:7890",
}
--------------------------------------------------------------------------------
/db/dbserver.py:
--------------------------------------------------------------------------------
import pymysql
import threading


class MySQLCommand(object):
    def __init__(self):
        self.host = 'localhost'
        self.port = 3306               # port
        self.user = 'root'             # user name
        self.password = "*"            # password
        self.db = "url"                # database
        self.table = "url_tables"      # table
        self.lock = threading.Lock()

    def connectMysql(self):
        try:
            self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user,
                                        passwd=self.password, db=self.db, charset='utf8')
            self.cursor = self.conn.cursor()
        except pymysql.Error as e:
            print('connect mysql error.', e)

    # Insert a URL; query first and skip the insert if the URL already exists.
    def insertData(self, url):
        # Parameterized queries keep scraped URLs from being interpreted as SQL.
        sqlExit = "SELECT url FROM url_tables WHERE url = %s"
        with self.lock:
            res = self.cursor.execute(sqlExit, (url,))
        if res:  # res is the number of matching rows; > 0 means the URL is already stored
            print("already exists", res)
            return 0
        # Only reached when the URL is not present yet.
        sql = "INSERT INTO url_tables (url) VALUES (%s)"
        try:
            with self.lock:
                result = self.cursor.execute(sql, (url,))
                insert_id = self.conn.insert_id()  # id assigned to the new row
                self.conn.commit()
            if result:
                print("inserted", insert_id)
                return insert_id
        except pymysql.Error as e:
            # Roll back on any error.
            self.conn.rollback()
            # A duplicate key means the row already exists and cannot be inserted.
            if "key 'PRIMARY'" in e.args[1]:
                print("already exists, nothing inserted")
            else:
                print("insert failed, reason %d: %s" % (e.args[0], e.args[1]))

# if __name__ == '__main__':
#     mysql = MySQLCommand()
#     mysql.connectMysql()
#     mysql.insertData("test2")
--------------------------------------------------------------------------------
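A minimal usage sketch of MySQLCommand, assuming a local MySQL instance with the `url` database from sql.sql already imported and the credentials above adjusted:

```python
from db.dbserver import MySQLCommand

# Connect once and reuse the handle; insertData skips URLs that are
# already present in url_tables.
mysql = MySQLCommand()
mysql.connectMysql()
print(mysql.insertData("https://example.com/"))  # new row: returns its id
print(mysql.insertData("https://example.com/"))  # duplicate: returns 0
```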
/keywords.txt:
--------------------------------------------------------------------------------
keyword1
keyword2
keyword3
--------------------------------------------------------------------------------
/main_tools.py:
--------------------------------------------------------------------------------
from module.Engine.bing.bing import BingEngine
from module.Engine.bing.bingInternational import BingInternationalEngine

import threading


def work1(keyword_list):
    bing = BingEngine(keyword_list)
    # bing.run()
    bing.asy_run()


def work2(keyword_list):
    bing = BingInternationalEngine(keyword_list)
    # bing.run()
    bing.asy_run()


if __name__ == '__main__':
    with open('keywords.txt', 'r', encoding='utf-8') as f:
        keyword_list = f.readlines()
    threads = []
    threads.append(threading.Thread(target=work1, args=(keyword_list,)))
    threads.append(threading.Thread(target=work2, args=(keyword_list,)))
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # wait for both engines to finish
--------------------------------------------------------------------------------
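The repository also ships a Google engine (module/Engine/google/google.py) that main_tools.py does not start, presumably because it depends on the proxy in config.py. A sketch of wiring it in as a third worker; note that GoogleEngine only implements the synchronous run(), not asy_run():

```python
from module.Engine.google.google import GoogleEngine


def work3(keyword_list):
    google = GoogleEngine(keyword_list)
    google.run()  # synchronous; GoogleEngine has no asy_run()

# In __main__, alongside the two Bing workers:
# threads.append(threading.Thread(target=work3, args=(keyword_list,)))
```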
/module/Engine/bing/bing.py:
--------------------------------------------------------------------------------
import requests
from lxml import etree
import re
import time
from db.dbserver import MySQLCommand
from concurrent.futures import ThreadPoolExecutor


class BingEngine(object):
    def __init__(self, keyword_list):
        self.mysql = MySQLCommand()
        self.mysql.connectMysql()
        self.keyword_list = [i.strip() for i in keyword_list]
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "cookie": "DUP=Q=BWQriQwgVDG-1uxZ9j7oRQ2&T=413722706&A=2&IG=D1EAECE750FE4873AD3AA3CE155EC927; MUID=2B7AE5B0424F69112605EAD6438F6819; SNRHOP=I=&TS=; MUIDB=2B7AE5B0424F69112605EAD6438F6819; SRCHD=AF=NOFORM; SRCHUID=V=2&GUID=0725D03312B54387BBE1E7B47B892572&dmnchg=1; ENSEARCH=BENVER=1; ULC=P=51B9|1:1&H=51B9|1:1&T=51B9|1:1; _SS=SID=11605D78715C6E7826A852AB701F6F67&bIm=341; _FP=hta=on; SerpPWA=reg=1; ENSEARCHZOSTATUS=STATUS=0; KievRPSSecAuth=FABKARRaTOJILtFsMkpLVWSG6AN6C/svRwNmAAAEgAAACNjbJXQoBj/BCAE9vNkU0vfQiujzKEEXXWSNsiMi0ZitKcEAb0czIJZ3vwzonozXlESTbc1YsHKxhMIgOhc92v1VIC3QzlfYO6s3WDpPOnis/ZNKDuDP%2BK8eZoUwCgm7baqh/S68jdwNZU9qCa/X0n9D5fjSZDsOlgjmcEBJnmXJOLJimYMGNldFEpd5AULy6lgqetFLBbMxVwW3cyjFiCpiMb0iA1abgcoqiGUI00wSt5L4BlYMRvh7QlT3gboUdGsFhWXvXjxX2xXNq%2B0fzUuhA6qtiivW9ygI9fvSPhJnQkmlRCXEvr28OlsfnGY%2BrjxiIFivVWLoi9eb5lSgkUTkZJlOCqZgIBzgaR/sZqeQJV8UANiGe7CAGu1YHTzP9tzU0ADiqX%2BJ; PPLState=1; ANON=A=EF96A70F0F8199A5D34957E8FFFFFFFF&E=1918&W=1; NAP=V=1.9&E=18be&C=Rr26JQgAdXit4mQy4EFHQz_lB7yg0BxQaHXYmhuwNINpWATyKDO6ag&W=1; _U=15DMfn6nsgtak4i3Lt55oPwwuogb1oCJ4FhnT4emp1uU9BlyBg2ED0hK6YoLgNoeXrYTKlXkhWqJX-QHCGDxNq1r4skT9wJdkNbqScyjpVY2Ynj0bIkWsyyGR2ZwBqzk8XjS05mBxd9ebMylwVeJ7_zA7FBDNjIWqUzlxNoJLZq76dya8GV9yyLzQTZa5Sss2; WLS=C=013892bea16ae168&N=mi; SRCHUSR=DOB=20210203&T=1612868226000&TPC=1612862553000&POEX=W; ipv6=hit=1612871830647&t=4; WLID=sPIZ6PTzUO038qUiNYGJueCscEV7vn21MXYK/gUXnagTHkKwEIZlxsYc4cgBDI/kZajIiktZPHuUMZ6vnBXHsfAYP41RgXa2aM54k488/F8=; _EDGE_S=SID=22B10F3EA4906227217B00E7A5D363F0&mkt=zh-cn; SRCHHPGUSR=BZA=0&BRW=W&BRH=S&CW=2034&CH=563&DPR=1&UTC=480&DM=0&PLTL=674&PLTA=674&PLTN=1&HV=1612868309&WTS=63748465026&SRCHLANGV2=zh-Hans"
        }
        self.pattern = re.compile(
            r'^((http://)|(https://))?([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}((/)|())?')

        self.xpath = ["//li[@class='b_algo']//a[@class='sb_metalink']//text()",
                      "//li[@class='b_algo']//div[@class='b_attribution']/cite//text()",
                      "//div[@class='b_caption']//div[@class='b_attribution']//cite//text()"
                      ]
        self.pool_size = 5
        self.pool = ThreadPoolExecutor(self.pool_size)

    @staticmethod
    def produce(keyword):
        # One URL per result page; Bing's `first` parameter steps through pages of 10.
        return ["https://cn.bing.com/search?q={}&qs=n&sp=-1&sp=-1&pq={}&sc=4-8&sk=&cvid=3C863030DEEA4A6F8CB1FB27CCAFCCE7&first={}&ubiroff=1&FORM=PERE".format(
            keyword, keyword, i) for i in range(1, 1000, 10)]

    def request(self, url):
        resp = requests.get(url, headers=self.headers)
        print(url)
        return resp.content.decode()

    def withdraw(self, content, s1):
        # Extract candidate strings from the result page and keep those that
        # match the domain pattern.
        html = etree.HTML(content)
        div_list = []
        for xpath in self.xpath:
            div_list += html.xpath(xpath)

        for div in div_list:
            m = self.pattern.match(div)
            if m:
                s1.add(m.group())
            else:
                time.sleep(2)  # back off briefly; non-matching text often means a WAF page

    def set_xpath(self):
        pass

    def insert_database(self, s1):
        for url in s1:
            self.mysql.insertData(url)

    def run(self):
        s1 = set()
        for keyword in self.keyword_list:
            s1.clear()
            for url in self.produce(keyword):
                content = self.request(url)
                self.withdraw(content, s1)
            self.insert_database(s1)
            # yield s1

    def task_single_keyword(self, keyword):
        s1 = set()
        for url in self.produce(keyword):
            content = self.request(url)
            self.withdraw(content, s1)
        return s1

    def insert_database_asy(self, future):
        s1 = future.result()
        self.insert_database(s1)

    def asy_run(self):
        # One pool task per keyword; the done-callback writes each keyword's
        # result set to MySQL as soon as it finishes.
        for keyword in self.keyword_list:
            s1 = self.pool.submit(self.task_single_keyword, keyword)
            s1.add_done_callback(self.insert_database_asy)


if __name__ == '__main__':
    test_list = ['cms', '后台', 'cms1', '后台1', 'cms2', '后台2', 'cms3', '后台3', 'cms4', '后台4']
    bing = BingEngine(test_list)
    # s1 = bing.run()
    bing.asy_run()
--------------------------------------------------------------------------------
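request() issues a single GET with no timeout or retry, so one stalled response can hang a worker thread for a long time. A small wrapper, as a sketch (the retry count and backoff schedule are arbitrary choices, not part of the project):

```python
import time

import requests


def request_with_retry(url, headers, retries=3, timeout=10):
    """GET with a timeout and simple exponential backoff between attempts."""
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            return resp.content.decode()
        except requests.RequestException as e:
            print(e)
            time.sleep(2 ** attempt)  # 1 s, 2 s, 4 s between attempts
    return ""  # give up; the caller just sees an empty page
```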
/module/Engine/bing/bingInternational.py:
--------------------------------------------------------------------------------
import requests
from lxml import etree
import re
import time
from db.dbserver import MySQLCommand
from concurrent.futures import ThreadPoolExecutor


class BingInternationalEngine(object):
    def __init__(self, keyword_list):
        self.mysql = MySQLCommand()
        self.mysql.connectMysql()
        self.keyword_list = [i.strip() for i in keyword_list]
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "cookie": "DUP=Q=BWQriQwgVDG-1uxZ9j7oRQ2&T=413722706&A=2&IG=D1EAECE750FE4873AD3AA3CE155EC927; MUID=2B7AE5B0424F69112605EAD6438F6819; SNRHOP=I=&TS=; MUIDB=2B7AE5B0424F69112605EAD6438F6819; SRCHD=AF=NOFORM; SRCHUID=V=2&GUID=0725D03312B54387BBE1E7B47B892572&dmnchg=1; ENSEARCH=BENVER=1; ULC=P=51B9|1:1&H=51B9|1:1&T=51B9|1:1; _SS=SID=11605D78715C6E7826A852AB701F6F67&bIm=341; _FP=hta=on; SerpPWA=reg=1; ENSEARCHZOSTATUS=STATUS=0; KievRPSSecAuth=FABKARRaTOJILtFsMkpLVWSG6AN6C/svRwNmAAAEgAAACNjbJXQoBj/BCAE9vNkU0vfQiujzKEEXXWSNsiMi0ZitKcEAb0czIJZ3vwzonozXlESTbc1YsHKxhMIgOhc92v1VIC3QzlfYO6s3WDpPOnis/ZNKDuDP%2BK8eZoUwCgm7baqh/S68jdwNZU9qCa/X0n9D5fjSZDsOlgjmcEBJnmXJOLJimYMGNldFEpd5AULy6lgqetFLBbMxVwW3cyjFiCpiMb0iA1abgcoqiGUI00wSt5L4BlYMRvh7QlT3gboUdGsFhWXvXjxX2xXNq%2B0fzUuhA6qtiivW9ygI9fvSPhJnQkmlRCXEvr28OlsfnGY%2BrjxiIFivVWLoi9eb5lSgkUTkZJlOCqZgIBzgaR/sZqeQJV8UANiGe7CAGu1YHTzP9tzU0ADiqX%2BJ; PPLState=1; ANON=A=EF96A70F0F8199A5D34957E8FFFFFFFF&E=1918&W=1; NAP=V=1.9&E=18be&C=Rr26JQgAdXit4mQy4EFHQz_lB7yg0BxQaHXYmhuwNINpWATyKDO6ag&W=1; _U=15DMfn6nsgtak4i3Lt55oPwwuogb1oCJ4FhnT4emp1uU9BlyBg2ED0hK6YoLgNoeXrYTKlXkhWqJX-QHCGDxNq1r4skT9wJdkNbqScyjpVY2Ynj0bIkWsyyGR2ZwBqzk8XjS05mBxd9ebMylwVeJ7_zA7FBDNjIWqUzlxNoJLZq76dya8GV9yyLzQTZa5Sss2; WLS=C=013892bea16ae168&N=mi; SRCHUSR=DOB=20210203&T=1612868226000&TPC=1612862553000&POEX=W; ipv6=hit=1612871830647&t=4; WLID=sPIZ6PTzUO038qUiNYGJueCscEV7vn21MXYK/gUXnagTHkKwEIZlxsYc4cgBDI/kZajIiktZPHuUMZ6vnBXHsfAYP41RgXa2aM54k488/F8=; _EDGE_S=SID=22B10F3EA4906227217B00E7A5D363F0&mkt=zh-cn; SRCHHPGUSR=BZA=0&BRW=W&BRH=S&CW=2034&CH=563&DPR=1&UTC=480&DM=0&PLTL=674&PLTA=674&PLTN=1&HV=1612868309&WTS=63748465026&SRCHLANGV2=zh-Hans"
        }
        self.pattern = re.compile(
            r'^((http://)|(https://))?([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}((/)|())?')

        self.xpath = ["//li[@class='b_algo']//a[@class='sb_metalink']//text()",
                      "//li[@class='b_algo']//div[@class='b_attribution']/cite//text()",
                      "//div[@class='b_caption']//div[@class='b_attribution']//cite//text()"
                      ]
        self.pool_size = 5
        self.pool = ThreadPoolExecutor(self.pool_size)

    @staticmethod
    def produce(keyword):
        # Same pagination as the China engine; ensearch=1 selects international results.
        return ["https://www.bing.com/search?q={}&qs=n&sp=-1&sp=-1&pq={}&sc=4-8&sk=&cvid=3C863030DEEA4A6F8CB1FB27CCAFCCE7&first={}&ubiroff=1&FORM=PERE&ensearch=1".format(
            keyword, keyword, i) for i in range(1, 1000, 10)]

    def request(self, url):
        resp = requests.get(url, headers=self.headers)
        print(url)
        return resp.content.decode()

    def withdraw(self, content, s1):
        html = etree.HTML(content)
        div_list = []
        for xpath in self.xpath:
            div_list += html.xpath(xpath)

        for div in div_list:
            m = self.pattern.match(div)
            if m:
                s1.add(m.group())
            else:
                time.sleep(2)  # back off briefly; non-matching text often means a WAF page

    def set_xpath(self):
        pass

    def insert_database(self, s1):
        for url in s1:
            self.mysql.insertData(url)

    def run(self):
        s1 = set()
        for keyword in self.keyword_list:
            s1.clear()
            for url in self.produce(keyword):
                content = self.request(url)
                self.withdraw(content, s1)
            self.insert_database(s1)

    def task_single_keyword(self, keyword):
        s1 = set()
        for url in self.produce(keyword):
            content = self.request(url)
            self.withdraw(content, s1)
        return s1

    def insert_database_asy(self, future):
        s1 = future.result()
        self.insert_database(s1)

    def asy_run(self):
        for keyword in self.keyword_list:
            s1 = self.pool.submit(self.task_single_keyword, keyword)
            s1.add_done_callback(self.insert_database_asy)


if __name__ == '__main__':
    test_list = ['cms', '后台']
    bing = BingInternationalEngine(test_list)
    bing.asy_run()
--------------------------------------------------------------------------------
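bing.py and bingInternational.py are identical apart from the URL template in produce(). A sketch of one way to fold the duplication into a shared base class; the query strings here are simplified stand-ins for the full templates above, and request/withdraw/run/asy_run would move into the base class unchanged:

```python
class BaseBingEngine(object):
    """Shared engine skeleton; subclasses supply only the search URL template."""

    # {0} is the keyword, {1} the `first` result offset.
    url_template = None

    def produce(self, keyword):
        # Same pagination as the real engines: pages of 10 up to first=991.
        return [self.url_template.format(keyword, i) for i in range(1, 1000, 10)]


class CnBingEngine(BaseBingEngine):
    url_template = "https://cn.bing.com/search?q={0}&pq={0}&first={1}"


class IntlBingEngine(BaseBingEngine):
    url_template = "https://www.bing.com/search?q={0}&pq={0}&first={1}&ensearch=1"
```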
/module/Engine/google/google.py:
--------------------------------------------------------------------------------
import requests
from lxml import etree
import re, time
import urllib3
import config
from db.dbserver import MySQLCommand


class GoogleEngine(object):
    def __init__(self, keyword_list):
        urllib3.disable_warnings()
        self.mysql = MySQLCommand()
        self.mysql.connectMysql()
        self.keyword_list = [i.strip() for i in keyword_list]
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'https://www.google.com.hk/',
            'Cookie': '1P_JAR=2022-07-21-12; NID=511=n9vG1tS6R54Id81eXSY6vi8RySxjrXNMT6d3QWKzSPYQ8WpNKD03kHB6TsTGtJFgqyPxRuIF2cophzRKhsBQR7TagmfP7SQZ7R2qXlGPWBQCTf47vw98IE3TGyYPKhDZ3zkkqOooILTyIZQ3nfgb44IVbKVP-qgtZMONIgwT9cTFKtUoBCLopYU0gL_WoxeW12qoZsg; AEC=AakniGNjeaGlCmmeL5BCWM58RXTz0SkSjouE0329LCwKKQOms7JUznlXIQ',
            'DNT': '1',
            'X-Forwarded-For': '8.8.8.8',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }
        self.proxies = config.PROXIES
        self.pattern = re.compile(
            r'^((http://)|(https://))?([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}((/)|())?')

        self.xpath = ["//a/div//text()"]

    @staticmethod
    def produce(keyword):
        # One URL per result page; Google's `start` parameter steps through pages of 10.
        return ["https://www.google.com.hk/search?q={}&lr=&newwindow=1&safe=images&hl=zh-CN&as_qdr=all&as_rights=%E4%B8%8D%E6%8C%89%E7%85%A7%E8%AE%B8%E5%8F%AF%E8%BF%87%E6%BB%A4&ei=ZUjZYqz7OPDL2roPy7CbeA&start={}&sa=N&biw=1536&bih=722&dpr=1.25".format(
            keyword, i) for i in range(1, 150, 10)]

    def request(self, url):
        resp = requests.get(url, headers=self.headers, proxies=self.proxies, verify=False, timeout=5)
        print(url)
        print(resp.content.decode())  # debug: dump the raw result page
        return resp.content.decode()

    def withdraw(self, content, s1):
        html = etree.HTML(content)
        div_list = []
        for xpath in self.xpath:
            div_list += html.xpath(xpath)

        for div in div_list:
            m = self.pattern.match(div)
            if m:
                s1.add(m.group())
            else:
                time.sleep(2)  # back off briefly; non-matching text often means a block page

    def set_xpath(self):
        pass

    def insert_database(self, s1):
        for url in s1:
            self.mysql.insertData(url)

    def run(self):
        s1 = set()
        for keyword in self.keyword_list:
            s1.clear()
            for url in self.produce(keyword):
                content = self.request(url)
                self.withdraw(content, s1)
            self.insert_database(s1)


if __name__ == '__main__':
    test_list = ['cms', '后台']
    google = GoogleEngine(test_list)
    google.run()
--------------------------------------------------------------------------------
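Every GoogleEngine request goes through config.PROXIES with verify=False, so a dead tunnel burns the whole keyword list on timeouts. A pre-flight check, as a sketch (generate_204 is just a convenient no-content probe endpoint):

```python
import requests

import config


def proxy_ok(timeout=5):
    """Return True if the proxy in config.PROXIES answers a simple GET."""
    try:
        resp = requests.get("https://www.google.com/generate_204",
                            proxies=config.PROXIES, timeout=timeout, verify=False)
        return resp.status_code in (200, 204)
    except requests.RequestException:
        return False
```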
/requirements.txt:
--------------------------------------------------------------------------------
lxml==4.8.0
requests==2.25.1
PyMySQL==1.0.2
--------------------------------------------------------------------------------
/sql.sql:
--------------------------------------------------------------------------------
/*
SQLyog Community v13.1.6 (64 bit)
MySQL - 5.7.31 : Database - url
*********************************************************************
*/

/*!40101 SET NAMES utf8 */;

/*!40101 SET SQL_MODE=''*/;

/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
/*Table structure for table `url_tables` */

DROP TABLE IF EXISTS `url_tables`;

CREATE TABLE `url_tables` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `url` tinytext,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=2 DEFAULT CHARSET=utf8;

/*Data for the table `url_tables` */

LOCK TABLES `url_tables` WRITE;

UNLOCK TABLES;

/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
--------------------------------------------------------------------------------
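sql.sql creates url_tables without a UNIQUE constraint on url, so all deduplication happens in Python. A one-off migration sketch that pushes it into MySQL instead (credentials as in db/dbserver.py; the 255-character index prefix is an assumption matching tinytext's maximum length):

```python
import pymysql

# Let MySQL reject duplicate URLs itself; tinytext columns need an
# explicit index prefix length.
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='*', db='url', charset='utf8')
with conn.cursor() as cur:
    cur.execute("ALTER TABLE url_tables ADD UNIQUE INDEX uniq_url (url(255))")
conn.commit()
conn.close()
```

With that index in place, the SELECT-then-INSERT pair in MySQLCommand.insertData could shrink to a single INSERT IGNORE.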