├── .gitignore ├── README.md ├── aqd_spider.py ├── common.py ├── config.ini.default ├── define.py ├── run.py ├── spider.py ├── static ├── app.min.css ├── app.min.js ├── glyphicons-halflings-regular.woff2 └── icon.png ├── templates ├── actresses.html ├── analyse.html ├── config.html ├── genre.html ├── group.html ├── index.html ├── install.html ├── main.html ├── movie.html ├── scandisk.html └── spider.html └── website.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.db 2 | *.ini 3 | *.txt 4 | .vscode/ 5 | .idea/ 6 | __pycache__/ 7 | /*.js 8 | /*.json 9 | test.py 10 | logs 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 运行需求 2 | python3 3 | **第三方包** 4 | ```bash 5 | pip install requests 6 | pip install lxml 7 | pip install flask 8 | ``` 9 | ## 页面功能 10 | 11 | - `http://127.0.0.1:5000/` 首页 12 | - `http://127.0.0.1:5000/search/已发布` 已发布 13 | - `http://127.0.0.1:5000/search/已下载` 已下载 14 | - `http://127.0.0.1:5000/search/有资源` 有资源 15 | 16 | ### 列表页 17 | - `http://127.0.0.1:5000/group` 番号列表 18 | - `http://127.0.0.1:5000/actresses` 演员列表 19 | - `http://127.0.0.1:5000/genre` 类别列表 20 | - `http://127.0.0.1:5000/studio` 制作商列表 21 | - `http://127.0.0.1:5000/label` 发行商列表 22 | - `http://127.0.0.1:5000/series` 系列列表 23 | 24 | ### 明细页 25 | - `http://127.0.0.1:5000/movie/e3dedf889e44cee8` 影片明细 26 | - `http://127.0.0.1:5000/group/IPX` 番号明细 27 | - `http://127.0.0.1:5000/star/1971f1973cf8172f` 演员明细 28 | - `http://127.0.0.1:5000/genre/dd21aefe7ae3228c` 类别明细 29 | - `http://127.0.0.1:5000/studio/80be243ea6164094` 制作商明细 30 | - `http://127.0.0.1:5000/label/b0b3be30e6bf490f` 发行商明细 31 | - `http://127.0.0.1:5000/series/c28ffa16eae1bf1e` 系列明细 32 | - `http://127.0.0.1:5000/director/bb914a54dc51b21b` 导演明细 33 | 34 | ### 收藏页 35 | - `http://127.0.0.1:5000/like/group` 收藏番号 36 | - `http://127.0.0.1:5000/like/movie` 收藏影片 37 | - `http://127.0.0.1:5000/like/studio` 收藏制作商 38 | - `http://127.0.0.1:5000/like/label` 收藏发行商 39 | - `http://127.0.0.1:5000/like/series` 收藏系列 40 | 41 | ### 分析页 42 | - `http://127.0.0.1:5000/analyse/group/IPX` 分析番号 43 | - `http://127.0.0.1:5000/analyse/star/1971f1973cf8172f` 分析演员 44 | - `http://127.0.0.1:5000/analyse/genre/dd21aefe7ae3228c` 分析类别 45 | - `http://127.0.0.1:5000/analyse/studio/80be243ea6164094` 分析制作商 46 | - `http://127.0.0.1:5000/analyse/label/b0b3be30e6bf490f` 分析发行商 47 | - `http://127.0.0.1:5000/analyse/director/bb914a54dc51b21b` 分析导演 48 | 49 | ### 功能页 50 | - `http://127.0.0.1:5000/spider` 爬虫,输入链接,抓取链接内所有影片 51 | - `http://127.0.0.1:5000/scandisk` 扫描硬盘,扫描本地硬盘,识别番号文件名 52 | - `http://127.0.0.1:5000/config` 修改配置 53 | 54 | ## 注意事项 55 | 1. 右上角的语言切换是用来切换目标站的语言的,会影响演员名/类目名,仿站没有做多语言 56 | 2. avmoo最多只支持抓取到100页,超过100页无法抓取 57 | 3. 图片卡的时候尝试去`config`页面切换`图片cdn源` 58 | 4. 右上角的`链接`按钮指的是avmoo源站对应的链接 59 | 5. 爬虫页可以查看最近写入库中的影片,也可以操作爬虫 60 | 6. 磁力搜索网站可以自己在配置添加,会在末尾拼av_id 61 | 62 | ## 打开方法 63 | 命令行运行`python run.py`启动本地web(首次打开会跳转到安装页面) 64 | 指定配置文件运行`python run.py config_main.db` 65 | 默认地址为`http://127.0.0.1:5000`,端口可通过配置修改 66 | -------------------------------------------------------------------------------- /aqd_spider.py: -------------------------------------------------------------------------------- 1 | from ast import Constant 2 | from asyncio import constants 3 | import time 4 | import sys 5 | from requests import Timeout 6 | import json 7 | 8 | from typing import Iterator 9 | from common import * 10 | 11 | ''' 12 | 抓取aqdav.net的影片添加到扩展信息 13 | 1. av_extend 中必须有{aqd}配置,(rename,aqd,https://vip.aqdtv540.com),用来记录最新的地址 14 | ''' 15 | class Aqd: 16 | instance = None 17 | requests_ins = None 18 | db_ins = None 19 | log = logging.getLogger('aqd') 20 | save_file = 'aqd_result.txt' 21 | 22 | @staticmethod 23 | def db(): 24 | if Aqd.db_ins is None: 25 | Aqd.log.info('spider.db.init') 26 | # 链接数据库 27 | Aqd.db_ins = sqlite3.connect(CONFIG.get("base", "db_file")) 28 | Aqd.db_ins.row_factory = make_dicts 29 | return Aqd.db_ins 30 | 31 | @staticmethod 32 | def requests(): 33 | if Aqd.requests_ins is None: 34 | Aqd.log.info('spider.requests.init') 35 | # 创建会话对象 36 | Aqd.requests_ins = requests.Session() 37 | Aqd.requests_ins.headers = { 38 | 'User-Agent': CONFIG.get("requests", "user_agent"), 39 | } 40 | return Aqd.requests_ins 41 | 42 | @staticmethod 43 | def fetchall(sql) -> list: 44 | cur = Aqd.db().cursor() 45 | cur.execute(sql) 46 | return cur.fetchall() 47 | 48 | @staticmethod 49 | def aqd_site_url() -> str: 50 | site_url = CONFIG.get('aqd', 'aqd_site').strip("/") 51 | r = Aqd.requests().get(site_url, timeout=CONFIG.getint("requests", "timeout")) 52 | p = parse.urlparse(r.url) 53 | 54 | new_site_url = "https://" + p.hostname 55 | if site_url != new_site_url: 56 | # 存储新的链接 57 | CONFIG.set(section='aqd', option='aqd_site', value=new_site_url) 58 | config_save(CONFIG) 59 | 60 | return new_site_url 61 | 62 | # 自动翻页返回影片url 63 | @staticmethod 64 | def url_general() -> Iterator[str]: 65 | site_url = Aqd.aqd_site_url() 66 | if empty(site_url): 67 | site_url = "https://www.aqd99.com" 68 | return 69 | 70 | for page_no in range(1, 500): 71 | time.sleep(1) 72 | # 有PART关键字的影片都是AV影片 73 | # url = site_url + '/videos/search?key=PART&page={}'.format(page_no) 74 | url = site_url + '/videos/category/jp/{}'.format(page_no) 75 | Aqd.log.info("get:{}".format(url)) 76 | 77 | (status_code, html) = Aqd.get_html_by_url(url) 78 | if status_code in [403, 404, 500] or html is None: 79 | Aqd.log.fatal("url:{} status_code:{}".format(url, status_code)) 80 | break 81 | 82 | item_a_list = html.xpath('//div[@class="row index-videos-list index-videos-item-list"]/div/div/div/a') 83 | if not item_a_list: 84 | Aqd.log.warning("page empty break") 85 | break 86 | for item in item_a_list: 87 | # 判断是否有番号id 88 | title = item.attrib.get('alt') 89 | url = item.attrib.get('href') 90 | # av_id = Aqd.get_av_id(title) 91 | # if empty(av_id): 92 | # continue 93 | head_img = item.xpath('img')[0].attrib.get('data-original') 94 | # Aqd.log.info("aqdurl:{},title:{}".format(url, title)) 95 | yield site_url + url, head_img 96 | 97 | @staticmethod 98 | def movie_save(insert_list: list) -> None: 99 | if empty(insert_list): 100 | return 101 | insert_list_str = "\n".join([json.dumps(x, ensure_ascii = False) for x in insert_list]) 102 | with open(Aqd.save_file, "a", encoding='utf-8') as f: 103 | f.write(insert_list_str + "\n") 104 | 105 | # 解析html数据 106 | @staticmethod 107 | def movie_page_data(html) -> dict: 108 | title = html.xpath("/html/body/section/div[2]/div[2]/div[3]/div/div[1]/h3/text()")[0] 109 | video = "" 110 | res = re.findall(r"(http.+\.m3u8)", html.xpath("//script")[-5].text) 111 | if non_empty(res): 112 | video = res[0] 113 | url = html.xpath('/html/head/meta[15]')[0].attrib.get('content') 114 | data = { 115 | 'id': int(re.findall("\d+$", url)[0]), 116 | 'title': title, 117 | 'av_id': Aqd.get_av_id(title), 118 | 'video': video, 119 | 'img': '', 120 | # 发行时间 121 | 'date': html.xpath('/html/body/section/div[2]/div[2]/div[3]/div/div[3]/span/text()')[0].strip()[-19:] 122 | } 123 | return data 124 | 125 | @staticmethod 126 | def get_html_by_url(url: str) -> tuple: 127 | retry_limit = 100 128 | for i in range(retry_limit): 129 | try: 130 | res = Aqd.requests().get(url, timeout=CONFIG.getint("requests", "timeout")) 131 | if res.status_code != 200: 132 | Aqd.log.error("status_code = {},url:{}".format(res.status_code, url)) 133 | return res.status_code, None 134 | 135 | return 200, etree.HTML(res.text) 136 | except Timeout as e: 137 | Aqd.log.warning("requests Timeout,error:{}\nretry url:{}".format( 138 | e, url 139 | )) 140 | # 休眠 141 | time.sleep(10) 142 | # 超时重试 143 | continue 144 | 145 | except ConnectionError as e: 146 | Aqd.log.warning("requests ConnectionError,error:{}\nretry url:{}".format( 147 | e, url 148 | )) 149 | # 休眠 150 | time.sleep(10) 151 | # 链接异常 152 | continue 153 | 154 | except Exception as e: 155 | Aqd.log.warning("requests Exception:{}\nurl:{}".format(e, url)) 156 | time.sleep(10) 157 | continue 158 | # 返回错误 159 | return 500, None 160 | 161 | @staticmethod 162 | def get_av_id(title: str) -> str: 163 | ''' 164 | 从title中获取avid,取不到返回空''' 165 | res = re.findall(r"\[([A-Z]+\-\d+)\]", title) 166 | if not res: 167 | return '' 168 | return res[0] 169 | 170 | @staticmethod 171 | def get_max_id() -> int: 172 | max_id = 0 173 | with open(Aqd.save_file, "r", encoding="utf-8") as f: 174 | for line in f.readlines(): 175 | row = json.loads(line.strip()) 176 | if row["id"] > max_id: 177 | max_id = row["id"] 178 | return max_id 179 | 180 | @staticmethod 181 | def fetch_data(): 182 | max_id = aqd.get_max_id() 183 | for url, img in aqd.url_general(): 184 | id = re.findall("\d+$", url)[0] 185 | if int(id) <= max_id: 186 | break 187 | status_code,html = Aqd.get_html_by_url(url) 188 | if status_code != 200: 189 | continue 190 | 191 | Aqd.log.info('fetch:{}'.format(url)) 192 | data = Aqd.movie_page_data(html) 193 | data['img'] = img 194 | Aqd.movie_save([data]) 195 | 196 | @staticmethod 197 | def insert_data(): 198 | with open(Aqd.save_file, "r", encoding="utf-8") as f: 199 | file_data = f.read() 200 | file_data = file_data.split("\n") 201 | for i in range(len(file_data))[::-1]: 202 | line = file_data[i].strip() 203 | if len(line) < 20: 204 | continue 205 | row = json.loads(line) 206 | if empty(row['av_id']): 207 | continue 208 | # 查询库里有没有当前id 209 | res = fetchall("select * from av_list where av_id ='{}'".format(row['av_id'])) 210 | if empty(res): 211 | Aqd.log.warning("av_id:{} none {}".format(row["av_id"], get_url("search", row["av_id"]))) 212 | else: 213 | Aqd.log.info("av_id:{} complete {}".format(row["av_id"], get_local_url("movie", row["av_id"]))) 214 | 215 | m3u8_url = "{}#{}".format(row["video"], row["id"]) 216 | # 查询数据是不是已存在 217 | res = fetchall("select * from av_extend where extend_name='movie_res' and key='{}' and val='{}'".format(row['av_id'], m3u8_url)) 218 | if non_empty(res): 219 | Aqd.log.info("{} exist,break".format(row['av_id'])) 220 | break 221 | insert("av_extend", [{ 222 | "extend_name": "movie_res", 223 | "key": row['av_id'], 224 | "val": m3u8_url 225 | }]) 226 | if __name__ == '__main__': 227 | # python aqd_spider.py ./config.ini.default 228 | if len(sys.argv) == 2: 229 | conf_file = sys.argv[1] 230 | else: 231 | print("wrong config file.") 232 | exit() 233 | 234 | init(conf_file) 235 | create_logger('aqd') 236 | aqd = Aqd() 237 | Aqd.log.info("[fetch_data start]") 238 | aqd.fetch_data() 239 | Aqd.log.info("[insert_data start]") 240 | aqd.insert_data() 241 | 242 | # print(aqd.get_max_id()) 243 | # status_code,html = Aqd.get_html_by_url("/videos/play/6988") 244 | # data = Aqd.movie_page_data(html) 245 | # print(data) 246 | -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | from sqlite3 import Connection 4 | 5 | import requests 6 | import configparser 7 | import os 8 | import re 9 | import sqlite3 10 | from lxml import etree 11 | import webbrowser 12 | import threading 13 | import binascii 14 | import traceback 15 | from urllib.parse import quote 16 | from queue import Queue 17 | from define import * 18 | from urllib import parse 19 | 20 | CONFIG_FILE = "config.ini" 21 | CONFIG_FILE_DEFAULT = "config.ini.default" 22 | CONFIG = configparser.ConfigParser() 23 | 24 | DB: Connection = None 25 | 26 | # 存储 av_genre,av_extend, rename数据,用于快速查找 27 | DATA_STORAGE = {} 28 | 29 | # 缓存 30 | SQL_CACHE = {} 31 | 32 | # 任务队列 33 | QUEUE = Queue(maxsize=0) 34 | 35 | LOG_FORMAT = "%(asctime)s [%(filename)s:%(lineno)d] %(levelname)s: %(message)s" 36 | LOGGER = logging.getLogger(APP_NAME) 37 | 38 | 39 | def init(file_name = None): 40 | global CONFIG_FILE 41 | # 创建日志 42 | create_logger(APP_NAME) 43 | # 关闭 werkzeug 的日志 44 | logging.getLogger('werkzeug').setLevel(logging.ERROR) 45 | LOGGER.info("common.init") 46 | 47 | if non_empty(file_name): 48 | # 命令行指定配置文件 49 | CONFIG_FILE = file_name 50 | # 初始化配置 51 | config_check() 52 | config_init() 53 | db_init() 54 | 55 | 56 | def make_dicts(cursor, row): 57 | return dict((cursor.description[idx][0], value) 58 | for idx, value in enumerate(row)) 59 | 60 | 61 | def db_init(): 62 | LOGGER.info("common.init.db") 63 | # 初始化db 64 | global DB 65 | db_file = CONFIG.get("base", "db_file") 66 | if os.path.exists(db_file): 67 | DB = sqlite3.connect(db_file, check_same_thread=False) 68 | DB.row_factory = make_dicts 69 | 70 | 71 | def storage_init(table: str) -> None: 72 | if table in DATA_STORAGE and non_empty(DATA_STORAGE[table]): 73 | return 74 | DATA_STORAGE[table] = fetchall("SELECT * FROM " + table) 75 | 76 | 77 | # 仅av_genre和av_extend使用 78 | def storage(table: str, conditions: dict = None, col: str = None) -> list: 79 | storage_init(table) 80 | ret = [] 81 | if not conditions: 82 | return DATA_STORAGE[table] 83 | # 每条记录 84 | for row in DATA_STORAGE[table]: 85 | hit = True 86 | # 每个条件 87 | for cond_key, cond_val in conditions.items(): 88 | if not cond_val: 89 | continue 90 | if isinstance(cond_val, str): 91 | if cond_val != row[cond_key]: 92 | hit = False 93 | break 94 | elif isinstance(cond_val, list): 95 | if row[cond_key] not in cond_val: 96 | hit = False 97 | break 98 | else: 99 | LOGGER.fatal("wrong type") 100 | if not hit: 101 | continue 102 | if col: 103 | ret.append(row[col]) 104 | else: 105 | ret.append(row) 106 | return ret 107 | 108 | 109 | def config_path() -> str: 110 | if os.path.exists(CONFIG_FILE): 111 | return CONFIG_FILE 112 | return CONFIG_FILE_DEFAULT 113 | 114 | 115 | def config_init() -> None: 116 | # 初始化配置 117 | LOGGER.info('CONFIG FILE:%r', config_path()) 118 | CONFIG.read(config_path()) 119 | 120 | 121 | # 配置文件 122 | def config_check(): 123 | if not os.path.exists(CONFIG_FILE): 124 | return 125 | config = configparser.ConfigParser() 126 | config.read(CONFIG_FILE) 127 | config_default = configparser.ConfigParser() 128 | config_default.read(CONFIG_FILE_DEFAULT) 129 | for (section, option) in [x.split('.') for x in CONFIG_NAME_LIST]: 130 | if not config.has_section(section): 131 | config.add_section(section) 132 | if not config.has_option(section, option): 133 | config.set(section, option, config_default.get(section, option)) 134 | config_save(config) 135 | 136 | 137 | def config_save(config): 138 | with open(CONFIG_FILE, "w") as fp: 139 | config.write(fp) 140 | 141 | 142 | # 创建日志记录器 143 | def create_logger(app_name: str): 144 | logger = logging.getLogger(app_name) 145 | logger.setLevel(logging.INFO) # Log等级总开关 146 | # 第二步,创建一个handler,用于写入日志文件 147 | if not os.path.exists('logs'): 148 | os.mkdir('logs') 149 | log_path = os.getcwd() + '/logs/' 150 | logfile = log_path + app_name + '.' + time.strftime('%Y%m%d%H', time.localtime(time.time())) + '.log' 151 | 152 | fh = logging.FileHandler(logfile, mode='a', encoding='utf-8') 153 | fh.setLevel(logging.DEBUG) # 输出到file的log等级的开关 154 | # 第三步,定义handler的输出格式 155 | fh.setFormatter(logging.Formatter(LOG_FORMAT)) 156 | # 第四步,将logger添加到handler里面 157 | logger.addHandler(fh) 158 | 159 | fh = logging.StreamHandler() 160 | fh.setLevel(logging.DEBUG) # 输出到file的log等级的开关 161 | # 第三步,定义handler的输出格式 162 | fh.setFormatter(logging.Formatter(LOG_FORMAT)) 163 | # 第四步,将logger添加到handler里面 164 | logger.addHandler(fh) 165 | 166 | 167 | def replace_sql_build(table: str, data: dict) -> str: 168 | sql = "REPLACE INTO {} ({}) VALUES ({})".format( 169 | table, ','.join(list(data)), ("?," * len(data))[:-1] 170 | ) 171 | return sql 172 | 173 | 174 | # sql插入操作 175 | # av_genre av_extend 176 | def insert(table: str, data: list): 177 | if CONFIG.getboolean("base", "readonly"): 178 | return 179 | if not data: 180 | return 181 | sql = replace_sql_build(table, data[0]) 182 | if len(sql) < 150: 183 | LOGGER.info(color(36, sql)) 184 | else: 185 | LOGGER.info(color(36, "INSERT,table:{},count:{}".format(table, len(data)))) 186 | DB.cursor().executemany(sql, [tuple(x.values()) for x in data]) 187 | DB.commit() 188 | if table in DATA_STORAGE: 189 | DATA_STORAGE[table].clear() 190 | 191 | 192 | # sql删除操作 193 | def delete(table: str, data: dict): 194 | if CONFIG.getboolean("base", "readonly"): 195 | return 196 | if not data: 197 | return 198 | sql = "DELETE FROM {} WHERE {}".format( 199 | table, " AND ".join(["{}='{}'".format(field, value) for field, value in data.items()])) 200 | execute(sql) 201 | if table in DATA_STORAGE: 202 | DATA_STORAGE[table].clear() 203 | 204 | 205 | # 执行sql 206 | def execute(sql): 207 | if CONFIG.getboolean("base", "readonly"): 208 | return 209 | LOGGER.info(color(35, sql)) 210 | DB.cursor().execute(sql) 211 | DB.commit() 212 | 213 | 214 | # 查询sql 没缓存 215 | def fetchall(sql) -> list: 216 | if DB is None: 217 | # 触发安装程序 218 | raise IOError('db') 219 | 220 | cur = DB.cursor() 221 | LOGGER.info(color(36, sql)) 222 | cur.execute(sql) 223 | return cur.fetchall() 224 | 225 | 226 | # 查询sql 带缓存 227 | def query_sql(sql) -> list: 228 | cache_key = gen_cache_key(sql) 229 | # 是否使用缓存 230 | if CONFIG.getboolean("website", "use_cache"): 231 | LOGGER.info('CACHE[%s]', cache_key) 232 | # 是否有缓存 233 | if cache_key in SQL_CACHE.keys(): 234 | return SQL_CACHE[cache_key][:] 235 | else: 236 | ret = fetchall(sql) 237 | if CONFIG.getboolean("website", "use_cache") and ret != []: 238 | SQL_CACHE[cache_key] = ret 239 | return ret[:] 240 | else: 241 | return fetchall(sql) 242 | 243 | 244 | def get_new_avmoo_site() -> str: 245 | res = requests.get('https://tellme.pw/avmoo') 246 | html = etree.HTML(res.text) 247 | avmoo_site = html.xpath( 248 | '/html/body/div[1]/div[2]/div/div[2]/h4[1]/strong/a/@href')[0] 249 | return avmoo_site 250 | 251 | 252 | def list_in_str(target_list: tuple, target_string: str) -> bool: 253 | for item in target_list: 254 | if item in target_string: 255 | return True 256 | return False 257 | 258 | 259 | def get_url(page_type: str = '', keyword: str = '', page_no: int = 1) -> str: 260 | ret = '{}/{}'.format(CONFIG.get("base", "avmoo_site"), 261 | CONFIG.get("base", "country"), ) 262 | if page_type == "search": 263 | if keyword != '': 264 | ret += '/{}/{}'.format(page_type, keyword) 265 | else: 266 | if page_type != '': 267 | ret += '/{}'.format(page_type) 268 | if keyword != '': 269 | ret += '/{}'.format(keyword) 270 | if page_no > 1: 271 | ret += '/page/{}'.format(page_no) 272 | return ret 273 | 274 | 275 | def get_local_url(page_type: str = '', keyword: str = '', page_no: int = 1) -> str: 276 | ret = 'http://{}:{}'.format(LOCAL_IP, CONFIG.getint("base", "port")) 277 | if page_type == "popular": 278 | return '' 279 | if page_type != '': 280 | ret += '/{}'.format(page_type) 281 | if keyword != '': 282 | ret += '/{}'.format(keyword) 283 | if page_no > 1: 284 | ret += '/page/{}'.format(page_no) 285 | return ret 286 | 287 | 288 | def search_where(key_item: str) -> str: 289 | key_item = sql_escape(key_item) 290 | return "(av_list.title LIKE '%{0}%' OR ".format(key_item) + \ 291 | "av_list.director = '{0}' OR ".format(key_item) + \ 292 | "av_list.studio = '{0}' OR ".format(key_item) + \ 293 | "av_list.label = '{0}' OR ".format(key_item) + \ 294 | "av_list.series LIKE '%{0}%' OR ".format(key_item) + \ 295 | "av_list.genre LIKE '%{0}%' OR ".format(key_item) + \ 296 | "av_list.stars LIKE '%{0}%')".format(key_item) 297 | 298 | 299 | def open_browser_tab(url): 300 | if not url: 301 | return 302 | LOGGER.info("open_browser_tab:%s", url) 303 | 304 | def _open_tab(url_param): 305 | webbrowser.open_new_tab(url_param) 306 | 307 | thread = threading.Thread(target=_open_tab, args=(url,)) 308 | thread.daemon = True 309 | thread.start() 310 | 311 | 312 | def sql_escape(keyword: str) -> str: 313 | for item in ESCAPE_LIST: 314 | keyword = keyword.replace(item[0], item[1]) 315 | return keyword 316 | 317 | 318 | # 解析源站url, 返回 page_type, keyword, page_start 319 | def parse_url(url: str) -> tuple: 320 | if url is None or url == '': 321 | return '', '', -1 322 | 323 | pattern_1 = "https?://[^/]+/[^/]+/popular(/page/(\\d+))?" 324 | pattern_2 = "https?://[^/]+/[^/]+/(movie|star|genre|series|studio|label|director|search)/([^/]+)(/page/(\\d+))?" 325 | 326 | if re.match(pattern_1, url): 327 | res = re.findall(pattern_1, url) 328 | page_start = int(res[0][1]) if res[0][1] else 1 329 | return "popular", '', page_start 330 | 331 | if re.match(pattern_2, url): 332 | res = re.findall(pattern_2, url) 333 | page_start = int(res[0][3]) if res[0][3] else 1 334 | return res[0][0], res[0][1], page_start 335 | 336 | LOGGER.fatal("wrong url:{}".format(url)) 337 | return '', '', -1 338 | 339 | 340 | # 获取sql中的表名 341 | def get_table_name(sql): 342 | return list(set(re.findall("(av_[a-z]+)", sql))) 343 | 344 | 345 | # 获取缓存key 346 | def gen_cache_key(sql): 347 | return '|'.join(get_table_name(sql)) + ':' + str(binascii.crc32(sql.encode()) & 0xffffffff) 348 | 349 | 350 | def empty(i: any) -> bool: 351 | if i is None: 352 | return True 353 | if isinstance(i, str): 354 | return i == '' 355 | if isinstance(i, list) or isinstance(i, tuple): 356 | return len(i) == 0 357 | if isinstance(i, dict): 358 | return i == {} 359 | if isinstance(i, int) or isinstance(i, float): 360 | return i == 0 361 | return False 362 | 363 | 364 | def non_empty(i: any) -> bool: 365 | return not empty(i) 366 | 367 | 368 | # 命令行颜色 369 | def color(c, s): 370 | if not CONFIG.getboolean('log', 'ansi_color'): 371 | return s 372 | """ 373 | \033[30m黑\033[0m 374 | \033[31m酱红\033[0m 375 | \033[32m浅绿\033[0m 376 | \033[33m黄褐\033[0m 377 | \033[34m浅蓝\033[0m 378 | \033[35m紫\033[0m 379 | \033[36m天蓝\033[0m 380 | \033[37m灰白\033[0m 381 | """ 382 | return "\033[{}m{}\033[0m".format(c, s) 383 | 384 | 385 | def upper_path(path: str) -> str: 386 | # 如果为windows环境路径,则路径首字母大写 387 | if re.match("^[a-z]:\\\\", path): 388 | return path[0].upper() + path[1:] 389 | else: 390 | return path 391 | 392 | 393 | def a_tag_build(link): 394 | return '{}'.format(link, link) 395 | 396 | 397 | # 识别linkid 398 | def is_linkid(linkid: str = '') -> bool: 399 | if empty(linkid): 400 | return False 401 | return re.match('^[a-z0-9]{16}$', linkid) is not None 402 | 403 | 404 | # 替换链接中命中rename的{rename} 405 | def url_rename(s: str) -> str: 406 | res = re.findall("{(.+)}", s) 407 | if res: 408 | return s.replace('{'+res[0]+'}', rename(res[0])) 409 | return s 410 | 411 | 412 | # 重命名 413 | def rename(name): 414 | # 渲染前准备rename数据 415 | storage_init(AV_EXTEND) 416 | if 'rename' not in DATA_STORAGE: 417 | DATA_STORAGE['rename'] = {} 418 | for row in DATA_STORAGE[AV_EXTEND]: 419 | if row['extend_name'] == 'rename': 420 | DATA_STORAGE['rename'][row['key']] = row['val'] 421 | if name in DATA_STORAGE['rename']: 422 | return DATA_STORAGE['rename'][name] 423 | return name 424 | 425 | 426 | # 列表小头图 427 | def small_img(s): 428 | return CONFIG.get('website', 'cdn') + '/digital/video' + s[:-6] + 'ps' + s[-4:] 429 | 430 | 431 | # 获取大头图 432 | def big_img(s): 433 | return CONFIG.get('website', 'cdn') + '/digital/video' + s 434 | 435 | 436 | # 是否为可播放的链接 437 | def can_play_url(s): 438 | p = parse.urlparse(s) 439 | if p.scheme not in ['http', 'https']: 440 | return False 441 | return list_in_str(('.m3u8', '.mp4', '.flv'), p.path) 442 | 443 | 444 | if __name__ == "__main__": 445 | pass 446 | -------------------------------------------------------------------------------- /config.ini.default: -------------------------------------------------------------------------------- 1 | [base] 2 | avmoo_site = https://avmoo.sbs 3 | db_file = avmoo.db 4 | port = 5000 5 | debug_mode = False 6 | # readonly,nothing update in sqlite db 7 | readonly = False 8 | 9 | # cn/tw/en/ja 10 | country = cn 11 | 12 | [spider] 13 | # could by a decimal number 14 | sleep = 2 15 | 16 | # integer 17 | insert_threshold = 10 18 | 19 | # continued skip count > continued_skip_limit, crawl end 20 | # integer 21 | continued_skip_limit = 30 22 | 23 | minimum_movie_duration = 0 24 | 25 | [requests] 26 | timeout = 3 27 | user_agent = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 28 | 29 | [website] 30 | # image server,//jp.netcdn.space, //pics.dmm.com, //pics.dmm.co.jp 31 | cdn = //pics.dmm.com 32 | 33 | # page item limit 34 | page_limit = 30 35 | 36 | # /actresses item limit 37 | actresses_page_limit = 36 38 | 39 | # /group item limit 40 | group_page_limit = 30 41 | 42 | # update work list and newest movie, interval time(ms) 43 | spider_page_interval_timeout = 20000 44 | 45 | # release_date or count 46 | group_page_order_by = count 47 | 48 | # cache enable or not 49 | use_cache = False 50 | 51 | # auto open site on run or not 52 | auto_open_site_on_run = True 53 | 54 | # auto open link when spider done work 55 | auto_open_link_when_crawl_done = True 56 | 57 | # fast run,less function 58 | efficiency_mode = False 59 | 60 | # search site url,av_id will be end of url 61 | search_url = https://btsow.rest/search/ 62 | 63 | [log] 64 | ansi_color = False 65 | 66 | [aqd] 67 | # from aqd get movie m3u8 play link 68 | aqd_site = https://vip.aqdx200.com 69 | -------------------------------------------------------------------------------- /define.py: -------------------------------------------------------------------------------- 1 | # flask应用名 2 | APP_NAME = 'website' 3 | 4 | # 表名 5 | AV_STARS = 'av_stars' 6 | AV_GENRE = 'av_genre' 7 | AV_LIST = 'av_list' 8 | AV_EXTEND = 'av_extend' 9 | 10 | # 右上角切换语言 11 | COUNTRY_MAP = { 12 | 'en': 'English', 13 | 'ja': '日本语', 14 | 'tw': '正體中文', 15 | 'cn': '简体中文', 16 | } 17 | 18 | PAGE_TYPE_MAP = { 19 | # page_type名 20 | 'director': { 21 | # 页面名称 22 | 'name': '导演', 23 | # 是否允许收藏 24 | 'like_enable': False, 25 | # 是否允许改显示名称 26 | 'rename_enable': True, 27 | # db字段, like key 28 | 'key': 'director_url', 29 | # av_list影片列表查询条件 30 | 'where': "director_url='{}'", 31 | }, 32 | 'movie': { 33 | 'name': '影片', 34 | 'like_enable': True, 35 | 'rename_enable': False, 36 | 'key': 'av_id', 37 | 'where': "linkid='{0}' OR av_id='{0}'", 38 | }, 39 | 'studio': { 40 | 'name': '制作商', 41 | 'like_enable': True, 42 | 'rename_enable': True, 43 | 'key': 'studio_url', 44 | 'where': "studio_url='{}'", 45 | }, 46 | 'label': { 47 | 'name': '发行商', 48 | 'like_enable': True, 49 | 'rename_enable': True, 50 | 'key': 'label_url', 51 | 'where': "label_url='{}'", 52 | }, 53 | 'series': { 54 | 'name': '系列', 55 | 'like_enable': True, 56 | 'rename_enable': True, 57 | 'key': 'series_url', 58 | 'where': "series_url='{}'", 59 | }, 60 | 'star': { 61 | 'name': '演员', 62 | 'like_enable': False, 63 | 'rename_enable': True, 64 | 'key': 'stars_url', 65 | 'where': "stars_url GLOB '*|{}*'", 66 | }, 67 | 'genre': { 68 | 'name': '类别', 69 | 'like_enable': False, 70 | 'rename_enable': True, 71 | 'key': 'genre_url', 72 | 'where': "genre GLOB '*|{}|*'", 73 | }, 74 | 'group': { 75 | 'name': '番号', 76 | 'like_enable': True, 77 | 'rename_enable': False, 78 | 'key': 'group', 79 | 'where': "av_id LIKE '{}-%'", 80 | }, 81 | 'like': { 82 | 'name': '收藏', 83 | 'like_enable': False, 84 | 'rename_enable': False, 85 | }, 86 | } 87 | 88 | # sqlite escapt list 89 | ESCAPE_LIST = ( 90 | ("/", "//"), 91 | ("'", "''"), 92 | ("[", "/["), 93 | ("]", "/]"), 94 | ("%", "/%"), 95 | ("&", "/&"), 96 | ("_", "/_"), 97 | ("(", "/("), 98 | (")", "/)"), 99 | ) 100 | 101 | PAGE_MAX = 100 102 | 103 | LOCAL_IP = "127.0.0.1" 104 | 105 | # /config 106 | CONFIG_NAME_LIST = [ 107 | "base.avmoo_site", 108 | "base.db_file", 109 | "base.port", 110 | "base.debug_mode", 111 | "base.readonly", 112 | "base.country", 113 | 114 | "spider.sleep", 115 | "spider.insert_threshold", 116 | "spider.continued_skip_limit", 117 | "spider.minimum_movie_duration", 118 | 119 | "requests.timeout", 120 | "requests.user_agent", 121 | 122 | "website.cdn", 123 | "website.page_limit", 124 | "website.actresses_page_limit", 125 | "website.group_page_limit", 126 | "website.spider_page_interval_timeout", 127 | "website.search_url", 128 | 129 | "website.group_page_order_by", 130 | "website.use_cache", 131 | "website.auto_open_site_on_run", 132 | "website.auto_open_link_when_crawl_done", 133 | "website.efficiency_mode", 134 | 135 | "log.ansi_color", 136 | ] 137 | 138 | # /spider 文件类型与判定 139 | FILE_TAIL = { 140 | 'mp4': "\\.(mp4|mkv|flv|avi|rm|rmvb|mpg|mpeg|mpe|m1v|mov|3gp|m4v|m3p|wmv|wmp|wm)$", 141 | 'jpg': "\\.(jpg|png|gif|jpeg|bmp|ico)$", 142 | 'mp3': "\\.(mp3|wav|wmv|mpa|mp2|ogg|m4a|aac)$", 143 | 'torrent': "\\.torrent$", 144 | 'zip': "\\.(zip|rar|gz|7z)$", 145 | 'doc': "\\.(xls|xlsx|doc|docx|ppt|pptx|csv|pdf|html|txt)$", 146 | } 147 | 148 | # /spider av视频文件判断正则 149 | AV_FILE_REG = "[a-zA-Z]{3,5}-\\d{3,4}" 150 | 151 | CREATE_AV_GENRE_SQL = ''' 152 | CREATE TABLE IF NOT EXISTS "av_genre" ( 153 | "linkid" CHAR(16) NOT NULL, 154 | "name" TEXT, 155 | "title" TEXT, 156 | PRIMARY KEY ("linkid") 157 | ); 158 | ''' 159 | 160 | CREATE_AV_LIST_SQL = ''' 161 | CREATE TABLE IF NOT EXISTS "av_list" ( 162 | "linkid" CHAR(16) NOT NULL, 163 | "title" TEXT, 164 | "av_id" VARCHAR(20), 165 | "release_date" CHAR(10), 166 | "len" INTEGER, 167 | "director" TEXT, 168 | "studio" TEXT, 169 | "label" TEXT, 170 | "series" TEXT, 171 | "genre" TEXT, 172 | "stars" TEXT, 173 | "director_url" TEXT, 174 | "studio_url" CHAR(16), 175 | "label_url" CHAR(16), 176 | "series_url" TEXT, 177 | "stars_url" TEXT, 178 | "bigimage" TEXT, 179 | "image_len" INTEGER, 180 | PRIMARY KEY ("linkid") 181 | ); 182 | ''' 183 | 184 | CREATE_AV_STARS_SQL = ''' 185 | CREATE TABLE IF NOT EXISTS "av_stars" ( 186 | "linkid" CHAR(16) NOT NULL, 187 | "name" TEXT, 188 | "name_history" TEXT, 189 | "birthday" TEXT, 190 | "height" TEXT, 191 | "cup" CHAR(1), 192 | "bust" TEXT, 193 | "waist" TEXT, 194 | "hips" TEXT, 195 | "hometown" TEXT, 196 | "hobby" TEXT, 197 | "headimg" TEXT, 198 | PRIMARY KEY ("linkid") 199 | ); 200 | ''' 201 | 202 | CREATE_AV_EXTEND_SQL = ''' 203 | CREATE TABLE IF NOT EXISTS "av_extend" ( 204 | "id" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, 205 | "extend_name" VARCHAR(10) NOT NULL, 206 | "key" VARCHAR(20) NOT NULL, 207 | "val" TEXT NOT NULL 208 | ); 209 | ''' 210 | 211 | AV_GENRE_DEMO_DATA = [ 212 | ('like', 'group', 'SSIS'), 213 | ('like', 'studio_url', '80be243ea6164094'), 214 | ('like', 'label_url', 'b0b3be30e6bf490f'), 215 | ('like', 'series_url', 'c343a1499f108277'), 216 | ('like', 'av_id', 'SSIS-318'), 217 | ('like', 'av_id', 'SSIS-318'), 218 | ('movie_res', 'SSIS-318', 'magnet:?xt=urn:btih:E0C7B27071A832388AF9C54553EECF71F4094256&dn=SSIS-318-C'), 219 | ] 220 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import common 2 | import spider 3 | import website 4 | import sys 5 | 6 | # 配置初始化 7 | # 1. 读取配置 8 | # 2. 初始化db 9 | # 3. 建表 10 | config_file = None 11 | if len(sys.argv) > 1: 12 | config_file = sys.argv[1] 13 | common.init(config_file) 14 | 15 | # 爬虫类初始化 16 | # 1. 初始化db 17 | # 2. 初始化requests 18 | # 3. genre为空则获取 19 | # 4. 启动爬虫线程 20 | spider.Spider().run() 21 | 22 | # flask应用 23 | website.run() 24 | -------------------------------------------------------------------------------- /spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | import time 4 | 5 | from requests import Timeout 6 | 7 | from typing import Iterator 8 | from common import * 9 | 10 | 11 | class Spider: 12 | instance = None 13 | requests_ins = None 14 | db_ins = None 15 | log = logging.getLogger('spider') 16 | 17 | def __init__(self): 18 | self.last_insert_list = [] 19 | self.running_work = None 20 | self.done_work = [] 21 | Spider.log.info("spider.init") 22 | 23 | def __new__(cls, *args, **kwargs): 24 | if not cls.instance: 25 | cls.instance = super(Spider, cls).__new__(cls) 26 | return cls.instance 27 | 28 | def run(self): 29 | create_logger('spider') 30 | # 启动爬虫线程 31 | if CONFIG.getboolean("base", "readonly"): 32 | return 33 | thread = threading.Thread(target=self.spider_thread, args=()) 34 | thread.daemon = True 35 | thread.start() 36 | 37 | @staticmethod 38 | def db(): 39 | if Spider.db_ins is None: 40 | Spider.log.info('spider.db.init') 41 | # 链接数据库 42 | Spider.db_ins = sqlite3.connect(CONFIG.get("base", "db_file")) 43 | Spider.db_ins.row_factory = make_dicts 44 | return Spider.db_ins 45 | 46 | @staticmethod 47 | def requests(): 48 | if Spider.requests_ins is None: 49 | requests.packages.urllib3.disable_warnings() 50 | Spider.log.info('spider.requests.init') 51 | # 创建会话对象 52 | Spider.requests_ins = requests.Session() 53 | # 忽略证书 54 | Spider.requests_ins.verify = False 55 | Spider.requests_ins.headers = { 56 | 'User-Agent': CONFIG.get("requests", "user_agent"), 57 | } 58 | # 代理 59 | Spider.requests_ins.proxies = { 60 | # 'https':'http://127.0.0.1:1080' 61 | } 62 | return Spider.requests_ins 63 | 64 | # 爬虫线程 65 | def spider_thread(self): 66 | Spider.log.info("spider_thread.start") 67 | while True: 68 | time.sleep(CONFIG.getfloat("spider", "sleep")) 69 | 70 | # 获取一个任务 71 | work_param = QUEUE.get() 72 | work_param["url"] = get_url(work_param["page_type"], work_param["keyword"], work_param["page_start"]) 73 | work_param["status"] = "ING" 74 | 75 | # 记录运行中任务 76 | self.running_work = work_param.copy() 77 | 78 | work_param["exist_linkid"] = {} 79 | # 是否跳过 默认跳过 80 | if "skip_exist" not in work_param or work_param.get("skip_exist"): 81 | work_param["exist_linkid"] = Spider.get_exist_linkid(work_param["page_type"], work_param["keyword"]) 82 | 83 | Spider.log.info("[crawl start]url:{0[url]} page_limit:{0[page_limit]}, exist_count:{1}".format( 84 | work_param, len(work_param["exist_linkid"]))) 85 | ret = self.crawl_accurate(work_param) 86 | 87 | # 打开浏览器提醒抓取完成 88 | if ret: 89 | # 清空缓存 90 | if CONFIG.getboolean("website", "use_cache"): 91 | SQL_CACHE.clear() 92 | if CONFIG.getboolean("website", "auto_open_link_when_crawl_done"): 93 | open_browser_tab(get_local_url(work_param["page_type"], work_param["keyword"], work_param["page_start"])) 94 | 95 | 96 | if "exist_linkid" in self.running_work: 97 | del self.running_work["exist_linkid"] 98 | self.done_work.append(self.running_work) 99 | self.running_work = None 100 | 101 | def get_last_insert_list(self): 102 | max_count = CONFIG.getint("spider", "insert_threshold") 103 | if len(self.last_insert_list) > max_count: 104 | # 取最后几个 105 | self.last_insert_list = self.last_insert_list[-max_count:] 106 | return self.last_insert_list 107 | 108 | def get_running_work(self, action: str = ''): 109 | if action: 110 | self.running_work["status"] = action 111 | return 112 | return self.running_work 113 | 114 | def get_done_work(self): 115 | return self.done_work 116 | 117 | @staticmethod 118 | def fetchall(sql) -> list: 119 | cur = Spider.db().cursor() 120 | cur.execute(sql) 121 | return cur.fetchall() 122 | 123 | # 根据链接参数抓取 124 | def crawl_accurate(self, work_param: dict) -> bool: 125 | page_type = work_param["page_type"] 126 | if not page_type: 127 | Spider.log.error("wrong param") 128 | return False 129 | # 单个电影 130 | if page_type == "movie": 131 | (status_code, data) = Spider.crawl_by_movie_linkid(work_param["keyword"]) 132 | if empty(data) or status_code != 200: 133 | Spider.log.warning("crawl_by_movie_linkid wrong,data:%r,status_code:%d", data, status_code) 134 | return False 135 | self.movie_save([data]) 136 | return True 137 | # 其他 138 | if page_type in ('genre', 'series', 'studio', 'label', 'director', 'search', 'star', 'popular'): 139 | self.crawl_by_page_type(work_param) 140 | return True 141 | Spider.log.fatal("wrong param,work_param:%s", work_param) 142 | return False 143 | 144 | # 获取所有类别 145 | @staticmethod 146 | def crawl_genre() -> list: 147 | genre_url = get_url('genre', '') 148 | Spider.log.info("get:%s", genre_url) 149 | (status_code, html) = Spider.get_html_by_url(genre_url) 150 | insert_list = [] 151 | h4 = html.xpath('/html/body/div[2]/h4/text()') 152 | div = html.xpath('/html/body/div[2]/div') 153 | for div_item in range(len(div)): 154 | g_title = h4[div_item] 155 | a_list = div[div_item].xpath('a') 156 | for a_item in a_list: 157 | if empty(a_item.text): 158 | continue 159 | insert_list.append({ 160 | "linkid": a_item.attrib.get('href')[-16:], 161 | "name": a_item.text, 162 | "title": g_title 163 | }) 164 | Spider.log.info('genre fetch record:%r', len(insert_list)) 165 | return insert_list 166 | 167 | # 根据页面类型抓取所有影片 168 | def crawl_by_page_type(self, work_param: dict) -> None: 169 | if work_param["page_type"] == 'star': 170 | Spider.stars_one(work_param["keyword"]) 171 | # 待插入 172 | insert_list = [] 173 | insert_count = 0 174 | skip_count = 0 175 | banned_count = 0 176 | continued_skip_count = 0 177 | for movie_linkid in Spider.linkid_general(work_param): 178 | # 跳出 179 | if self.running_work["status"] != "ING": 180 | # 任务结束 181 | break 182 | 183 | # 跳过已存在的 184 | if movie_linkid in work_param["exist_linkid"]: 185 | skip_count += 1 186 | continued_skip_count += 1 187 | # Spider.log.info("SKIP EXIST,URL:%s", get_local_url("movie", movie_linkid)) 188 | # 连续跳过到指定数量,则跳出抓取 189 | if continued_skip_count >= CONFIG.getint("spider", "continued_skip_limit"): 190 | break 191 | continue 192 | 193 | continued_skip_count = 0 194 | time.sleep(CONFIG.getfloat("spider", "sleep")) 195 | 196 | (status_code, data) = Spider.crawl_by_movie_linkid(movie_linkid) 197 | if status_code == 403: 198 | banned_count += 1 199 | if banned_count == 10: 200 | Spider.log.info("banned count:%d,break loop", banned_count) 201 | break 202 | continue 203 | if empty(data): 204 | continue 205 | 206 | # 判断影片是否符合要求 207 | duration = CONFIG.getint("spider", "minimum_movie_duration") 208 | if duration > 0 and data["len"] < duration: 209 | Spider.log.info("movie duration non conformance,url:%s", get_url("movie", movie_linkid)) 210 | continue 211 | 212 | insert_list.append(data) 213 | # 存储数据 214 | if len(insert_list) == CONFIG.getint("spider", "insert_threshold"): 215 | self.movie_save(insert_list) 216 | insert_count += len(insert_list) 217 | insert_list = [] 218 | # 插入剩余的数据 219 | self.movie_save(insert_list) 220 | insert_count += len(insert_list) 221 | Spider.log.info("[exist_count:{}][fetch_count:{}][skip_count:{}]".format( 222 | len(work_param["exist_linkid"]), insert_count, skip_count)) 223 | 224 | # 根据linkid抓取一个movie页面 225 | @staticmethod 226 | def crawl_by_movie_linkid(movie_linkid: str) -> tuple: 227 | url = get_url('movie', movie_linkid) 228 | (status_code, html) = Spider.get_html_by_url(url) 229 | if status_code != 200: 230 | return status_code, None 231 | if html is None: 232 | return status_code, None 233 | # 解析页面内容 234 | try: 235 | data = Spider.movie_page_data(html) 236 | except Exception as e: 237 | Spider.log.error('movie_page_data error:%s', traceback.format_exc()) 238 | return status_code, None 239 | 240 | if empty(data) or empty(data['av_id']) or empty(data["title"]): 241 | Spider.log.error("movie crawl fatal,linkid:%s", movie_linkid) 242 | return 500, None 243 | data['linkid'] = movie_linkid 244 | # 输出当前进度 245 | Spider.log.info(data['av_id'].ljust(15) + data['release_date'] + ' ' + data['stars']) 246 | return status_code, data 247 | 248 | # 获取一个明星的信息 249 | @staticmethod 250 | def stars_one(linkid: str): 251 | stars_res = Spider.fetchall("SELECT * FROM av_stars WHERE linkid='{}'".format(linkid)) 252 | if len(stars_res) == 1: 253 | return stars_res[0] 254 | 255 | def get_val(str_param): 256 | return str_param.split(':')[1].strip() 257 | 258 | url = get_url('star', linkid) 259 | data = { 260 | 'linkid': linkid, 261 | 'name': '', 262 | 'name_history': '', 263 | 'birthday': '', 264 | 'height': '', 265 | 'cup': '', 266 | 'bust': '', 267 | 'waist': '', 268 | 'hips': '', 269 | 'hometown': '', 270 | 'hobby': '', 271 | 'headimg': '' 272 | } 273 | Spider.log.info("get:%s", url) 274 | (status_code, html) = Spider.get_html_by_url(url) 275 | if html is None: 276 | return False 277 | 278 | try: 279 | data['name'] = html.xpath( 280 | '/html/head/meta[8]/@content')[0].split(',', 1)[0] 281 | data['headimg'] = html.xpath( 282 | '//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')[0].split('/', 3)[3].replace( 283 | 'mono/actjpgs/nowprinting.gif', '') 284 | except: 285 | return False 286 | 287 | for item_p in html.xpath('//*[@id="waterfall"]/div[1]/div/div[2]/p'): 288 | if empty(item_p.text): 289 | continue 290 | if list_in_str(('生日:', 'Birthday:', '生年月日:'), item_p.text): 291 | data['birthday'] = get_val(item_p.text) 292 | continue 293 | if list_in_str(('身高:', 'Height:', '身長:'), item_p.text): 294 | data['height'] = get_val(item_p.text) 295 | continue 296 | if list_in_str(('罩杯:', 'Cup:', 'ブラのサイズ:'), item_p.text): 297 | data['cup'] = get_val(item_p.text) 298 | continue 299 | if list_in_str(('胸围:', 'Bust:', 'バスト:'), item_p.text): 300 | data['bust'] = get_val(item_p.text) 301 | continue 302 | if list_in_str(('腰围:', 'Waist:', 'ウエスト:'), item_p.text): 303 | data['waist'] = get_val(item_p.text) 304 | continue 305 | if list_in_str(('臀围:', 'Hips:', 'ヒップ:'), item_p.text): 306 | data['hips'] = get_val(item_p.text) 307 | continue 308 | if list_in_str(('出生地:', 'Hometown:', '出身地:'), item_p.text): 309 | data['hometown'] = get_val(item_p.text) 310 | continue 311 | if list_in_str(('爱好:', 'Hobby:', '趣味:'), item_p.text): 312 | data['hobby'] = get_val(item_p.text) 313 | continue 314 | # 讲括号中的名字记录为曾用名 315 | tmp = data['name'].replace('(', '(').replace(')', '').split('(') 316 | if len(tmp) == 2: 317 | data['name_history'] = tmp[1] 318 | Spider.log.info("star:%r", data) 319 | Spider.stars_save(data) 320 | return data 321 | 322 | # 自动翻页返回movie_id 323 | @staticmethod 324 | def linkid_general(work_param: dict) -> Iterator[str]: 325 | # 网站限制最多100页 326 | for page_no in range(work_param["page_start"], work_param["page_limit"] + 1): 327 | time.sleep(CONFIG.getfloat("spider", "sleep")) 328 | 329 | url = get_url(work_param["page_type"], work_param["keyword"], page_no) 330 | Spider.log.info("get:{}".format(url)) 331 | 332 | (status_code, html) = Spider.get_html_by_url(url) 333 | if status_code in [304, 403, 404, 500] or html is None: 334 | break 335 | 336 | movie_id_list = html.xpath('//*[@id="waterfall"]/div/a/@href') 337 | if not movie_id_list: 338 | Spider.log.warning("page empty break") 339 | break 340 | for item in movie_id_list: 341 | if re.search("movie/[a-z0-9]{16}$", item): 342 | yield item[-16:] 343 | 344 | # 检查是否有下一页 345 | next_page = html.xpath( 346 | '//span[@class="glyphicon glyphicon-chevron-right"]') 347 | if not next_page: 348 | break 349 | 350 | @staticmethod 351 | def stars_save(data: dict) -> None: 352 | insert_sql = replace_sql_build(AV_STARS, data) 353 | Spider.db().execute(insert_sql, tuple(data.values())) 354 | Spider.db().commit() 355 | 356 | # 插入数据库 357 | def movie_save(self, insert_list: list) -> None: 358 | if empty(insert_list): 359 | return 360 | self.last_insert_list.extend(insert_list) 361 | 362 | insert_sql = replace_sql_build(AV_LIST, insert_list[0]) 363 | cur = Spider.db().cursor() 364 | cur.executemany(insert_sql, [tuple(x.values()) for x in insert_list]) 365 | Spider.db().commit() 366 | Spider.log.info('INSERT:%d', len(insert_list)) 367 | 368 | # 解析html数据 369 | @staticmethod 370 | def movie_page_data(html) -> dict: 371 | data = { 372 | 'linkid': '', 373 | # 番号 374 | 'av_id': html.xpath('/html/body/div[2]/div[1]/div[2]/p[1]/span[2]/text()')[0].strip().upper(), 375 | 'director': '', 376 | 'director_url': '', 377 | 'studio': '', 378 | 'studio_url': '', 379 | 'label': '', 380 | 'label_url': '', 381 | 'series': '', 382 | 'series_url': '', 383 | 'genre': '', 384 | 'stars': '', 385 | 'stars_url': '', 386 | # 图片个数image_len 387 | 'image_len': int(len(html.xpath('//div[@id="sample-waterfall"]/a'))), 388 | 'len': 0, 389 | # 标题 390 | 'title': html.xpath('/html/body/div[2]/h3/text()')[0].strip(), 391 | # 封面 截取域名之后的部分 392 | 'bigimage': '/' + html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')[0].split('/', 5)[5].strip(), 393 | # 发行时间 394 | 'release_date': html.xpath('/html/body/div[2]/div[1]/div[2]/p[2]/text()')[0].strip() 395 | } 396 | # 时长len 397 | len_text = html.xpath('/html/body/div[2]/div[1]/div[2]/p[3]/text()') 398 | if non_empty(len_text): 399 | res = re.findall("(\\d+)", len_text[0]) 400 | if non_empty(res): 401 | data['len'] = int(res[0].strip()) 402 | 403 | # 获取:导演、制作商、发行商、系列 404 | right_info = html.xpath('/html/body/div[2]/div[1]/div[2]/p/a') 405 | for i in right_info: 406 | if empty(i.text): 407 | continue 408 | tmp_href = i.attrib.get('href') 409 | 410 | if "/director/" in tmp_href: 411 | # 导演 412 | data['director'] = i.text.strip() 413 | data['director_url'] = tmp_href[-16:] 414 | elif "/studio/" in tmp_href: 415 | # 制作商 416 | data['studio'] = i.text.strip() 417 | data['studio_url'] = tmp_href[-16:] 418 | elif "/label/" in tmp_href: 419 | # 发行商 420 | data['label'] = i.text.strip() 421 | data['label_url'] = tmp_href[-16:] 422 | elif "/series/" in tmp_href: 423 | # 系列 424 | data['series'] = i.text.strip() 425 | data['series_url'] = tmp_href[-16:] 426 | 427 | genre_list = [] 428 | # 获取类别列表genre 类别列表genre_url 429 | for genre_tag in html.xpath('/html/body/div[2]/div[1]/div[2]/p/span/a'): 430 | if genre_tag.text is None: 431 | continue 432 | # 获取类目链接 433 | link = genre_tag.attrib.get('href') 434 | # 获取类目名 435 | name = genre_tag.text.strip() 436 | genre_list.append(name) 437 | 438 | # 查看类目是否存在,不存在则添加 439 | storage_ret = storage(AV_GENRE, {"linkid": link[-16:]}, "name") 440 | if empty(storage_ret): 441 | # 添加新类目 442 | genre_data = { 443 | 'linkid': link[-16:], 444 | 'name': name, 445 | 'title': '未知分类' 446 | } 447 | Spider.log.info('find new genre:%r', genre_data) 448 | sql = replace_sql_build(AV_GENRE, genre_data) 449 | Spider.db().execute(sql, tuple(genre_data.values())) 450 | Spider.db().commit() 451 | DATA_STORAGE[AV_GENRE].clear() 452 | 453 | data['genre'] = '|'.join(genre_list) 454 | if non_empty(data['genre']): 455 | data['genre'] = '|' + data['genre'] + '|' 456 | 457 | # 演员stars 458 | star_list = html.xpath('//div[@id="avatar-waterfall"]/a/span/text()') 459 | data['stars'] = '|'.join([x.strip() for x in star_list]) 460 | if non_empty(data['stars']): 461 | data['stars'] = '|' + data['stars'] + '|' 462 | 463 | # stars_url 464 | stars_url_list = html.xpath('//div[@id="avatar-waterfall"]/a/@href') 465 | if non_empty(stars_url_list): 466 | data['stars_url'] = '|' + '|'.join([re.findall('([a-z0-9]+)$', x)[0] 467 | for x in stars_url_list]) 468 | 469 | return data 470 | 471 | # 查询已存在影片 472 | @staticmethod 473 | def get_exist_linkid(page_type: str, keyword: str) -> dict: 474 | sql = '' 475 | exist_linkid_dict = {} 476 | # 必须有值 477 | if not keyword: 478 | return {} 479 | # 查询已存在的 480 | if page_type in ['director', 'studio', 'label', 'series']: 481 | sql = "SELECT linkid FROM av_list WHERE {}_url='{}'".format(page_type, keyword) 482 | if page_type == 'genre': 483 | genre = Spider.fetchall("SELECT name FROM av_genre WHERE linkid='{}'".format(keyword)) 484 | if genre: 485 | sql = "SELECT linkid FROM av_list WHERE genre LIKE '%|{}|%'".format(genre[0]['name']) 486 | if page_type == 'star': 487 | sql = "SELECT linkid FROM av_list WHERE stars_url LIKE '%{}%'".format(keyword) 488 | if page_type == 'group': 489 | sql = "SELECT linkid FROM av_list WHERE av_id LIKE '{}-%'".format(keyword) 490 | if page_type == 'search': 491 | where = [] 492 | for key_item in keyword.split(' '): 493 | where.append(search_where(key_item)) 494 | sql = "SELECT linkid FROM av_list WHERE " + " AND ".join(where) 495 | if non_empty(sql): 496 | ret = Spider.fetchall(sql) 497 | exist_linkid_dict = {x["linkid"]: True for x in ret} 498 | return exist_linkid_dict 499 | 500 | @staticmethod 501 | def get_html_by_url(url: str) -> tuple: 502 | retry_limit = 100 503 | for i in range(retry_limit): 504 | try: 505 | res = Spider.requests().get(url, timeout=CONFIG.getint("requests", "timeout")) 506 | if res.status_code != 200: 507 | Spider.log.error("status_code = {},url:{}".format(res.status_code, url)) 508 | return res.status_code, None 509 | 510 | return 200, etree.HTML(res.text) 511 | except Timeout as e: 512 | Spider.log.warning("requests Timeout,error:{}\nretry url:{}".format( 513 | e, url 514 | )) 515 | # 休眠 516 | time.sleep(10) 517 | # 超时重试 518 | continue 519 | 520 | except ConnectionError as e: 521 | Spider.log.warning("requests ConnectionError,error:{}\nretry url:{}".format( 522 | e, url 523 | )) 524 | # 休眠 525 | time.sleep(10) 526 | # 链接异常 527 | continue 528 | 529 | except Exception as e: 530 | Spider.log.warning("requests Exception:{}\nurl:{}".format(e, url)) 531 | time.sleep(10) 532 | continue 533 | # 返回错误 534 | return 500, None 535 | 536 | 537 | if __name__ == '__main__': 538 | pass 539 | -------------------------------------------------------------------------------- /static/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moozik/avmoo-spider/424d5f95f4f8dc3b766231235a49cd6e8cc49d4b/static/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /static/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moozik/avmoo-spider/424d5f95f4f8dc3b766231235a49cd6e8cc49d4b/static/icon.png -------------------------------------------------------------------------------- /templates/actresses.html: -------------------------------------------------------------------------------- 1 | {% extends "main.html" %} 2 | 3 | 4 | {% block head %} 5 | 10 | {% endblock %} 11 | 12 | {% block container %} 13 |
生日: {{ data.av_stars.birthday }}
{% endif %} 21 | {% if data.av_stars.age %}年龄: {{ data.av_stars.age }}岁
{% endif %} 22 | {% if data.av_stars.height %}身高: {{ data.av_stars.height }}
{% endif %} 23 | {% if data.av_stars.cup %}罩杯: {{ data.av_stars.cup }}
{% endif %} 24 | {% if data.av_stars.bust %}胸围: {{ data.av_stars.bust }}
{% endif %} 25 | {% if data.av_stars.waist %}腰围: {{ data.av_stars.waist }}
{% endif %} 26 | {% if data.av_stars.hips %}臀围: {{ data.av_stars.hips }}
{% endif %} 27 | {% if data.av_stars.hometown %}出生地: {{ data.av_stars.hometown }}
{% endif %} 28 | {% if data.av_stars.hobby %}爱好: {{ data.av_stars.hobby }}
{% endif %} 29 |{{ page_type_map[data.page_type].name }}:
40 |{{ frame_data.placeholder | rename }}
41 | {% endif %} 42 |43 | 47 |
48 | 50 | {# page_type_map中定义的才展示收藏 #} 51 | {% if page_type_map[data.page_type].like_enable %} 52 |53 | {% if data.is_like %} 54 | 58 | {% else %} 59 | 63 | {% endif %} 64 |
65 | {% endif %} 66 | {% if page_type_map[data.page_type].rename_enable %} 67 |68 | 72 |
73 | {% endif %} 74 |14 | 识别码: 15 | {{data.av_id}} 16 |
17 |18 | 番号: 19 | {{data.av_group}} 20 |
21 | {% if data.release_date %} 22 |23 | 发行时间: {{data.release_date}} 24 |
25 | {% endif %} 26 | {% if data.len %} 27 |28 | 长度: {{data.len}}分钟 29 |
30 | {% endif %} 31 | {% if data.director_url %} 32 |33 | 导演: 34 | {{data.director | rename}} 35 |
36 | {% endif %} 37 | {% if data.studio_url %} 38 |制作商:
39 |40 | {{data.studio | rename}} 41 |
42 | {% endif %} 43 | {% if data.label_url %} 44 |发行商:
45 |46 | {{data.label | rename}} 47 |
48 | {% endif %} 49 | {% if data.series_url %} 50 |系列:
51 |52 | {{data.series | rename}} 53 |
54 | {% endif %} 55 | {% if data.genre_data %} 56 |类别:
57 |58 | {% for item in data.genre_data %} 59 | 60 | {{item.name | rename}} 61 | 62 | {% endfor %} 63 |
64 | {% endif %} 65 |操作:
66 |67 | {% if data.is_like %} 68 | 69 | {% else %} 70 | 71 | {% endif %} 72 | 73 |
74 |130 | {% for item in data.stars_map %} 131 | 132 | {{item.name}} 133 | 134 | {% endfor %} 135 |
136 | {% endif %} 137 | 138 | 139 | 140 | {% if data.image_len > 0 %} 141 |路径 | 41 |打开 | 42 | {% if file_target == "mp4" %} 43 |信息 | 44 | {% endif %} 45 |
---|---|---|
{{resource.file_path}} | 52 |53 | |
54 | {{resource.av_id}}
55 | {% if resource.info.has_fetch_movie %}
56 | [影片已抓取]
57 | {% else %}
58 | [影片未抓取]
59 | {% endif %}
60 | {% if resource.info.has_res_extend %}
61 | [已存储路径]
62 | {% else %}
63 | [未存储路径]
64 |
65 | {% endif %}
66 | {# #}
67 | {# #}
68 | {# #}
72 | {# #}
73 | {# #}
74 | {# #}
78 | {# #}
79 | |
80 |
{{resource.file_path}} | 88 |89 | | {{resource.av_id}} [影片未抓取] | 90 |
{{resource.file_path}} | 98 |99 | | 100 | |
{{resource.file_path}} | 109 |110 | |