├── README.md
├── requirements.txt
├── tasks.py
├── exporter.py
├── parser.py
├── log.py
├── LICENSE
├── config.py
├── .gitignore
├── FakeUAdb.py
├── common.py
├── uaServer.py
├── models.py
├── main.py
└── FakeUA.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# FakeUA

The crawler script is currently not working; it is recommended to use the prebuilt database archive from the Releases page instead.
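Once `useragents.db` (from the release archive, or built by `FakeUA.py`) sits next to the scripts, the helpers in `FakeUAdb.py` can be queried directly. A minimal sketch:

```python
from FakeUAdb import UserAgent, UserAgentGroups

# a random UA whose software column matches one of the given values
count, ua = UserAgent({"software": ["Android Browser 4.0"]})
print(count, ua)

# top 5 rendering engines by number of stored useragents
print(UserAgentGroups("engine", 5))
```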
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
trio==0.6.0+dev
peewee==3.7.1
stem==1.6.0
termcolor==1.1.0
asks==2.0.0
pyquery==1.4.0
# also imported by exporter.py, parser.py, common.py, uaServer.py, tasks.py and models.py:
pandas
moment
numpy
sanic
celery
redis
pymysql
--------------------------------------------------------------------------------
/tasks.py:
--------------------------------------------------------------------------------
import math

from celery import Celery
from termcolor import colored

from config import config
from log import Loger as loger

app = Celery('darknet', broker=f'redis://{config.redis_host}:{config.redis_port}//')


def MakeChunk(datas, length=100):
    # yield `datas` in slices of at most `length` rows for batched inserts
    for item in range(0, math.ceil(len(datas) / length)):
        yield datas[item * length:(item + 1) * length]


@app.task()
def SaveToDB(datas, model):
    if datas:
        loger.info(colored(f'saving data to DB [{len(datas)}]', 'yellow'))
        for chunk in list(MakeChunk(datas)):
            # insert the current chunk, skipping rows that collide with the
            # unique useragent column
            model.insert_many(chunk).on_conflict_ignore().execute()
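# Illustrative usage (not part of the original module): start a worker with
# `celery -A tasks worker` and invoke the task. Note that a peewee model class
# is not JSON-serializable, so `SaveToDB.delay(...)` would fail with Celery's
# default JSON serializer; calling the task synchronously (or configuring the
# pickle serializer) sidesteps that:
#
#   from tasks import SaveToDB
#   from FakeUAdb import UAS
#   SaveToDB([{'useragent': 'Mozilla/5.0 (illustrative)'}], UAS)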
--------------------------------------------------------------------------------
/exporter.py:
--------------------------------------------------------------------------------
import json

import pandas

from common import checkTimes
from log import success


def CreateXLSX(datas, columns, filename='res.xlsx'):
    with checkTimes():
        xlsx = pandas.DataFrame(datas)
        xlsx.rename(columns=columns, inplace=True)
        # options= is the pre-pandas-1.2 spelling of engine_kwargs
        writer = pandas.ExcelWriter(
            filename, options={'strings_to_urls': False})
        xlsx.to_excel(writer, "data")
        writer.save()
        success(f'Created {filename}')


def CreateJson(datas, filename='res.json'):
    with checkTimes():
        with open(filename, 'w') as f:
            f.write(json.dumps(datas, ensure_ascii=False, indent=4))
        success(f'Saved {filename}')
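# Illustrative usage (not part of the original module), assuming `rows` is a
# list of row dicts such as FakeUA.py collects:
#
#   from exporter import CreateXLSX, CreateJson
#   rows = [{'useragent': 'Mozilla/5.0 (illustrative)', 'engine': 'WebKit'}]
#   CreateJson(rows, 'res.json')
#   CreateXLSX(rows, columns={'useragent': 'User-Agent', 'engine': 'Engine'},
#              filename='res.xlsx')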
--------------------------------------------------------------------------------
/parser.py:
--------------------------------------------------------------------------------
from urllib.parse import urljoin, urlparse

import moment
import numpy
from pyquery import PyQuery as jq

from common import addtotal, addupdate
from log import success, info


def ParserSelect(text, url, types, city):
    # template parser: fill in real selectors in place of the 'xxx' placeholders
    jqdata = jq(text)
    payload = {}

    try:
        if types == 'xxx':
            for item in jqdata('.xxx').items():
                hid = item.text()
                payload[hid] = {
                    "hid": hid,
                }
                addtotal()
                success(f"{hid}")

        elif types == 'xxx_details':
            payload = {

            }
            addupdate()

    except Exception:
        raise

    return payload
--------------------------------------------------------------------------------
/log.py:
--------------------------------------------------------------------------------
import logging

from termcolor import colored

from config import config


def makeStatus():
    flag = '⚠️' if config.runfortest else ''
    return (f"{flag}[🏠:{colored(config.status['total'], 'blue')} "
            f"🌀:{colored(config.status['updated'], 'blue')} "
            f"✅:{colored(config.status['success'], 'green')} "
            f"🚫:{colored(config.status['failed'], 'red')}]")


# logging.basicConfig(
#     format='[%(asctime)s] >>> %(levelname)s %(name)s: %(message)s', level=logging.INFO)


logging.basicConfig(
    format='[%(asctime)s]%(message)s', level=logging.INFO)
Loger = logging.getLogger(config.name)


def info(txt):
    return Loger.info(f"{makeStatus()} {colored(txt, 'blue')}")


def success(txt):
    return Loger.info(f"{makeStatus()} {colored(txt, 'green')}")


def warning(txt):
    return Loger.warning(f"{makeStatus()} {colored(txt, 'yellow')}")


def error(txt):
    return Loger.error(f"{makeStatus()} {colored(txt, 'red')}")
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Ai3

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
class Config(object):

    DEFAULT_LIMIT = 10
    MAX_LIMIT = 100
    SERVER_PORT = 80

    name = 'spider'
    maxConnections = 20
    runfortest = False  # referenced by log.makeStatus

    fakeHeader = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'cache-control': 'max-age=0',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }

    status = {'success': 0, 'failed': 0, 'total': 0, 'updated': 0}
    defaultstatus = {'success': 0, 'failed': 0, 'total': 0, 'updated': 0}

    mysql_or_sqlite = 1  # 1 = MySQL, 0 = SQLite (see models.py)
    mysql_host = '127.0.0.1'
    mysql_port = 32769
    mysql_usr = 'root'
    mysql_pass = 'root'

    redis_host = '127.0.0.1'
    redis_port = 32768

    mongoURI = 'mongodb://root:4030aoii1033@localhost:32770/lianjia?authSource=admin'

    db = 'work'

    exportfunc = ['json', 'excel']

    FAKEHEADER = {
        "x-requested-with": "XMLHttpRequest",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
        "referer": "http://www.mafengwo.cn/",
        "accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01"
    }  # default fake request headers

    POOLS = {}  # data pool
    TASKS = set()  # task pool
    DATANUMS = 0
    MAXNUMS = 0
    LIMIT = 10  # concurrency limit
    PERPAGE = 50


conf = Config
config = Config  # both names are imported across the project
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
*.zip
*.db

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/FakeUAdb.py:
--------------------------------------------------------------------------------
from peewee import *

db = SqliteDatabase('useragents.db')  # initialize the database


class UAS(Model):
    uid = AutoField(primary_key=True)  # auto-increment ID
    useragent = TextField(unique=True)  # useragent string
    software = CharField(null=True)  # software type
    engine = CharField(null=True)  # rendering engine
    types = CharField(null=True)  # hardware type
    popularity = CharField(null=True)  # popularity

    class Meta:
        database = db  # bind the model to the database


db.connect()  # connect to the database
db.create_tables([UAS])  # create any missing tables


def UserAgent(searchwords, methods='and'):
    """
    searchwords format:
    {
        "key": [
            "words1",
            "words2"
        ]
    }
    """
    count = 0
    resagent = ''
    if methods not in ['and', 'or']:
        return count, resagent
    methods = '|' if methods == 'or' else '&'
    whereQuery = f' {methods} '.join([
        f'(UAS.{key} << {str(item)})' for key, item in searchwords.items()
    ])
    try:
        count = UAS.select().where(eval(whereQuery)).order_by(fn.Random()).count()
        resagent = UAS.select().where(eval(whereQuery)).order_by(
            fn.Random()).limit(1)[0].useragent
    except Exception:
        pass
    return count, resagent


def UserAgentGroups(colname, limit=10):
    if colname in ['software', 'engine', 'types', 'popularity']:  # validate the column name
        target = eval(f'UAS.{colname}')  # resolve the target field
        return {eval(f'item.{colname}'): item.nums for item in UAS.select(target, fn.COUNT(target).alias('nums')).group_by(target).order_by(fn.COUNT(target).desc()).limit(limit)}


if __name__ == '__main__':
    from pprint import pprint
    print(UserAgent({
        "software": [
            'Android Browser 4.0'
        ]
    }))
    # pprint(UserAgentGroups('engine', 5))
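# Illustrative alternative (not part of the original module): the same WHERE
# clause can be composed without eval() by combining peewee expressions
# directly, which avoids executing interpolated strings:
#
#   import operator
#   from functools import reduce
#
#   def buildWhere(searchwords, methods='and'):
#       combine = operator.or_ if methods == 'or' else operator.and_
#       clauses = [getattr(UAS, key) << items        # << is peewee's IN
#                  for key, items in searchwords.items()]
#       return reduce(combine, clauses)
#
#   count = UAS.select().where(buildWhere({'software': ['Android Browser 4.0']})).count()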
--------------------------------------------------------------------------------
/common.py:
--------------------------------------------------------------------------------
import copy
import os
import time
from contextlib import contextmanager

import moment

from config import config
from log import info


@contextmanager
def checkTimes(level=3):
    # log the elapsed wall-clock time of the wrapped block, rounded to `level` digits
    timeStart = time.time()
    yield
    info(f'cost times: {round(time.time() - timeStart, level)}s')


def checkCount(func):
    # decorator: count successful/failed calls in the shared status counters
    def checker(*args, **kwargs):
        try:
            res = func(*args, **kwargs)
            config.status['success'] += 1
            return res
        except Exception:
            config.status['failed'] += 1
            raise
    return checker


def addsuccess():
    config.status['success'] += 1


def addfailed():
    config.status['failed'] += 1


def addtotal():
    config.status['total'] += 1


def addupdate():
    config.status['updated'] += 1


def checkPath(path):
    return os.path.exists(path)


def initPath(path):
    if not checkPath(path):
        os.makedirs(path)


def timeSections(starttimes=None, endtimes=None, sectiondays=30, sectionshours=1, formats=None):
    """
    Yield (start, end) windows of `sectionshours` hours between `starttimes`
    and `endtimes`; defaults to the last `sectiondays` days.
    """
    if not starttimes:
        starttimes = moment.now().replace(minutes=0, seconds=0).add(days=-sectiondays)
    else:
        starttimes = moment.date(starttimes)
    if not endtimes:
        endtimes = moment.now().replace(minutes=0, seconds=0)
    else:
        endtimes = moment.date(endtimes)

    while starttimes < endtimes:
        # moment objects mutate in place on add(), so work on a copy
        nexttimes = copy.deepcopy(starttimes).add(hours=sectionshours)
        if formats:
            yield starttimes.format(formats), nexttimes.format(formats)
        else:
            yield starttimes, nexttimes
        starttimes = nexttimes


if __name__ == "__main__":
    print(list(timeSections(formats='YYYY-MM-DD hh:mm:ss')))
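# Illustrative usage of the checkCount decorator (not part of the original
# module); `fetchPage` is a hypothetical function:
#
#   from common import checkCount
#
#   @checkCount
#   def fetchPage(url):
#       ...  # raises on failure
#
#   # every call now bumps config.status['success'] or config.status['failed'],
#   # which log.makeStatus() renders in front of each log line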
--------------------------------------------------------------------------------
/uaServer.py:
--------------------------------------------------------------------------------
import time
from contextlib import contextmanager

from peewee import fn
from sanic import Sanic
from sanic.log import logger
from sanic.response import json, redirect
from termcolor import colored

from config import conf
from FakeUAdb import UAS


@contextmanager
def checkTimes():
    startTime = time.time()
    yield
    logger.info(
        colored(f'cost times: [{round((time.time() - startTime) * 1000, 3)}]ms', 'green'))


app = Sanic('FakeUA')


@app.route('/')
async def handle_request(request):
    return redirect('/fakeua')


@app.route('/fakeua')
async def query_string(request):
    with checkTimes():
        # Sanic's request.args.get() returns the first value for a key,
        # or the default when the key is absent
        args = request.args
        query = UAS.select()

        keywords = args.get('keywords', '')[:16].lower()
        if keywords:
            query = query.where(UAS.useragent.contains(keywords))

        engine = args.get('engine', '')
        engine = engine.lower() if len(engine) < 10 and ''.join(
            engine.split()).isalpha() else ''
        if engine:
            query = query.where(UAS.engine.contains(engine))

        types = args.get('types', '')
        types = types.lower() if len(types) < 10 else ''
        if types:
            query = query.where(UAS.types.contains(types))

        software = args.get('software', '')
        software = software.lower() if len(software) < 24 else ''
        if software:
            query = query.where(UAS.software.contains(software))

        limit = args.get('limit', str(conf.DEFAULT_LIMIT))
        limit = int(limit) if limit.isdigit() else conf.DEFAULT_LIMIT
        limit = limit if limit < conf.MAX_LIMIT + 1 else conf.DEFAULT_LIMIT

        counts = query.count()
        results = [item.useragent for item in
                   query.order_by(fn.Random()).limit(limit)]

        return json({'total': counts, 'results': results})


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=conf.SERVER_PORT)
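# Illustrative client call (not part of the original module), assuming the
# server runs locally on conf.SERVER_PORT (80 by default):
#
#   import json
#   from urllib.request import urlopen
#
#   with urlopen('http://127.0.0.1/fakeua?engine=webkit&limit=3') as resp:
#       payload = json.loads(resp.read())
#   print(payload['total'], payload['results'])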
--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
import pymysql
from peewee import *
from peewee import __exception_wrapper__

from config import config

db = None


class RetryOperationalError(object):
    # mixin: transparently reconnect and retry once when MySQL drops the connection

    def execute_sql(self, sql, params=None, commit=True):
        try:
            cursor = super(RetryOperationalError, self).execute_sql(
                sql, params, commit)
        except OperationalError:
            if not self.is_closed():
                self.close()
            with __exception_wrapper__:
                cursor = self.cursor()
                cursor.execute(sql, params or ())
                if commit and not self.in_transaction():
                    self.commit()
        return cursor


class RetryMySQLDatabase(RetryOperationalError, MySQLDatabase):
    pass


if config.mysql_or_sqlite:
    Links = {
        'host': config.mysql_host,
        'port': config.mysql_port,
        'user': config.mysql_usr,
        'password': config.mysql_pass,
    }
    try:
        con = pymysql.connect(**Links)
        with con.cursor() as cursor:
            cursor.execute(
                f'create database {config.db} character set UTF8mb4 collate utf8mb4_bin')
        con.close()
    except pymysql.err.ProgrammingError as e:
        if '1007' not in str(e):  # error 1007: database already exists
            raise
    except Exception as e:
        raise e
    Links['database'] = config.db
    db = RetryMySQLDatabase(**Links, charset='utf8mb4')
else:
    db = SqliteDatabase(config.db)


class UAS(Model):
    uid = AutoField(primary_key=True)  # auto-increment ID
    useragent = TextField(unique=True)  # useragent string
    software = CharField(null=True)  # software type
    engine = CharField(null=True)  # rendering engine
    types = CharField(null=True)  # hardware type
    popularity = CharField(null=True)  # popularity

    class Meta:
        database = db  # bind the model to the database


db.connect()  # connect to the database
db.create_tables([UAS])  # create any missing tables
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import math
from urllib.parse import urljoin

import trio
from pyquery import PyQuery as jq
from termcolor import colored

from log import info, warning, error, success
# shared state and helpers still live in FakeUA.py; importing them here also
# runs FakeUA's asks.init() and Tor connectivity check
from FakeUA import (LIMIT, PERPAGE, POOLS, TASKS, SaveJson, getUAsitem,
                    loger, main, spiderSession)


class spider(object):
    """
    Work-in-progress class-based refactor of FakeUA.py: the coroutines below
    are copies of the module-level ones and still share FakeUA's pools and
    session; run() is not implemented yet.
    """

    def __init__(self, *args, **kwargs):
        self.datas = {}

    async def getTypesL1(self):
        """
        Fetch the first-level categories.
        """
        url = "https://developers.whatismybrowser.com/useragents/explore/"
        resp = await spiderSession.get(url)
        async with trio.open_nursery() as nursery:
            for item in jq(resp.text)("#listing-by-field-name > li > h2 > a").items():
                types = item.text().strip().replace(' ', '_').lower()
                POOLS[types] = {}
                nursery.start_soon(
                    getTypesL2, POOLS[types], types, urljoin(url, item.attr('href')))

    def run(self):
        pass


async def getTypesL2(target, types, href):
    """
    Fetch the second-level categories.
    """
    loger.info(colored(f'fetching {href}', 'yellow'))
    resp = await spiderSession.get(href)
    async with trio.open_nursery() as nursery:
        for item in jq(resp.text)("body > div.content-base > section > div > table > tbody > tr").items():
            name = item(
                'td:nth-child(1)>a').text().strip().replace(' ', '_').lower()
            target[name] = {}
            url = urljoin(href, item('td:nth-child(1)>a').attr('href'))
            nums = int(item('td:nth-child(2)').text().strip())
            target[name]['url'] = url
            target[name]['nums'] = nums
            target[name]['UA_list'] = []
            for page in range(1, math.ceil(nums / PERPAGE) + 1):
                TASKS.add('__'.join([
                    types,
                    name,
                    f"{url}{page}"
                ]))


async def getUAs():
    """
    Crawl task scheduling.
    """
    global MAXNUMS
    limit = trio.CapacityLimiter(LIMIT)
    while TASKS:
        MAXNUMS = len(list(TASKS))
        loger.info(colored(f'tasks remaining: {MAXNUMS}', 'red'))
        await trio.sleep(1)
        async with trio.open_nursery() as nursery:
            for item in list(TASKS):
                nursery.start_soon(getUAsitem, item, limit)


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        pass
    except Exception as e:
        loger.error(colored(e, 'red'))
    finally:
        SaveJson(POOLS, 'POOLS.json')
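# The TASKS set built above encodes each crawl job as a single string,
# 'types__name__pageurl', which getUAsitem() in FakeUA.py splits back apart.
# A quick illustration of the round trip (values illustrative):
#
#   key = '__'.join(['software', 'chrome',
#                    'https://developers.whatismybrowser.com/useragents/explore/software_name/chrome/3'])
#   types, name, url = key.split('__')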
--------------------------------------------------------------------------------
/FakeUA.py:
--------------------------------------------------------------------------------
import json
import logging
import math
import sys
from urllib.parse import urljoin

import asks
import trio
from pyquery import PyQuery as jq
from stem import Signal
from stem.connection import connect
from termcolor import colored

from FakeUAdb import UAS

asks.init('trio')  # initialize asks with the trio event loop
FAKEHEADER = {
    "x-requested-with": "XMLHttpRequest",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    "referer": "http://www.mafengwo.cn/",
    "accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01"
}  # default fake request headers
POOLS = {}  # data pool
TASKS = set()  # task pool
DATANUMS = 0
MAXNUMS = 0
LIMIT = 10  # concurrency limit
PERPAGE = 50
spiderSession = asks.Session(connections=LIMIT)
spiderSession.headers = FAKEHEADER


logging.basicConfig(
    format="[%(asctime)s] >>> %(levelname)s %(name)s: %(message)s", level=logging.INFO)  # log format and level
loger = logging.getLogger('FakeUA')  # module logger
try:
    controller = connect()
    controller.authenticate()
except Exception as e:
    loger.error(colored('please check your Tor control port', 'red'))
    sys.exit(1)


async def getTypesL1():
    """
    Fetch the first-level categories.
    """
    url = "https://developers.whatismybrowser.com/useragents/explore/"
    resp = await spiderSession.get(url)
    # listing-by-field-name > li:nth-child(1) > h2 > a
    # listing-by-field-name > li:nth-child(2) > h2 > a
    async with trio.open_nursery() as nursery:
        for item in jq(resp.text)("#listing-by-field-name > li > h2 > a").items():
            types = item.text().strip().replace(' ', '_').lower()
            POOLS[types] = {}
            nursery.start_soon(
                getTypesL2, POOLS[types], types, urljoin(url, item.attr('href')))


async def getTypesL2(target, types, href):
    """
    Fetch the second-level categories.
    """
    loger.info(colored(f'fetching {href}', 'yellow'))
    resp = await spiderSession.get(href)
    async with trio.open_nursery() as nursery:
        for item in jq(resp.text)("body > div.content-base > section > div > table > tbody > tr").items():
            name = item(
                'td:nth-child(1)>a').text().strip().replace(' ', '_').lower()
            target[name] = {}
            url = urljoin(href, item('td:nth-child(1)>a').attr('href'))
            nums = int(item('td:nth-child(2)').text().strip())
            target[name]['url'] = url
            target[name]['nums'] = nums
            target[name]['UA_list'] = []
            for page in range(1, math.ceil(nums / PERPAGE) + 1):
                TASKS.add('__'.join([
                    types,
                    name,
                    f"{url}{page}"
                ]))


async def getUAs():
    """
    Crawl task scheduling.
    """
    global MAXNUMS
    limit = trio.CapacityLimiter(LIMIT)
    while TASKS:
        MAXNUMS = len(list(TASKS))
        loger.info(colored(f'tasks remaining: {MAXNUMS}', 'red'))
        await trio.sleep(1)
        async with trio.open_nursery() as nursery:
            for item in list(TASKS):
                nursery.start_soon(getUAsitem, item, limit)


async def getUAsitem(details, limit):
    """
    Fetch a single task.
    """
    global DATANUMS
    global MAXNUMS

    types, name, url = details.split('__')
    target = POOLS[types][name]['UA_list']
    async with limit:
        try:
            loger.info(colored(f'fetching -> {url}', 'yellow'))
            resp = await spiderSession.get(url, timeout=5, retries=3)
            LocalDatas = []
            for item in jq(resp.text)(
                    "body > div.content-base > section > div > table > tbody > tr").items():
                datas = {
                    'useragent': item('td.useragent').text(),
                    'software': item('td:nth-child(2)').attr('title'),
                    'engine': item('td:nth-child(3)').text(),
                    'types': item('td:nth-child(4)').text(),
                    'popularity': item('td:nth-child(5)').text()
                }
                loger.info(
                    '[' +
                    colored(DATANUMS, 'green') +
                    '/' +
                    colored(MAXNUMS, 'yellow') +
                    '/' +
                    colored(str(len(target)), 'blue') +
                    ']' +
                    colored('->', 'blue').join([
                        colored(types, 'red'),
                        colored(name, 'red'),
                        colored(datas["useragent"], 'green')
                    ]))
                target.append(datas)
                LocalDatas.append(datas)
            SaveToDB(LocalDatas, UAS)
            TASKS.remove(details)
            DATANUMS += 1
            MAXNUMS -= 1
        except KeyboardInterrupt:
            raise
        except Exception as e:
            loger.error(colored(e, 'red'))
            NewID()


def NewID():
    controller.signal(Signal.NEWNYM)
    loger.error(colored('switching Tor circuit', 'red'))


def SaveJson(datas, filename):
    """
    Persist the data pool as JSON.
    """
    if not datas:
        return
    loger.info(colored(f'saving file to {filename}', 'yellow'))
    with open(filename, 'w') as f:
        f.write(json.dumps(datas, indent=4, ensure_ascii=False))


def MakeChunk(datas, length=100):
    # yield `datas` in slices of at most `length` rows for batched inserts
    for item in range(0, math.ceil(len(datas) / length)):
        yield datas[item * length:(item + 1) * length]


def SaveToDB(datas, model):
    if datas:
        loger.info(colored(f'saving data to DB [{len(datas)}]', 'yellow'))
        for chunk in list(MakeChunk(datas)):
            # insert the current chunk, skipping rows that collide with the
            # unique useragent column
            model.insert_many(chunk).on_conflict_ignore().execute()


def main():
    """
    Main flow: fetch the category tree (rotating Tor circuits on failure),
    then crawl every paginated list.
    """
    while True:
        try:
            trio.run(getTypesL1)
            break
        except Exception:
            NewID()
    trio.run(getUAs)


if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        pass
    except Exception as e:
        loger.error(colored(e, 'red'))
    finally:
        SaveJson(POOLS, 'POOLS.json')
        # SaveToDB(POOLS, UAS)
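# For reference, POOLS.json ends up shaped like this (keys taken from the
# code above, values illustrative):
#
#   {
#       "software": {
#           "chrome": {
#               "url": "https://developers.whatismybrowser.com/useragents/explore/software_name/chrome/",
#               "nums": 2,
#               "UA_list": [
#                   {"useragent": "Mozilla/5.0 ...", "software": "Chrome 69",
#                    "engine": "WebKit", "types": "Computer", "popularity": "Very common"}
#               ]
#           }
#       }
#   }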
--------------------------------------------------------------------------------