├── .gitignore ├── README.md ├── aqd_spider.py ├── common.py ├── config.ini.default ├── define.py ├── run.py ├── spider.py ├── static ├── app.min.css ├── app.min.js ├── glyphicons-halflings-regular.woff2 └── icon.png ├── templates ├── actresses.html ├── analyse.html ├── config.html ├── genre.html ├── group.html ├── index.html ├── install.html ├── main.html ├── movie.html ├── scandisk.html └── spider.html └── website.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.db 2 | *.ini 3 | *.txt 4 | .vscode/ 5 | .idea/ 6 | __pycache__/ 7 | /*.js 8 | /*.json 9 | test.py 10 | logs 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 运行需求 2 | python3 3 | **第三方包** 4 | ```bash 5 | pip install requests 6 | pip install lxml 7 | pip install flask 8 | ``` 9 | ## 页面功能 10 | 11 | - `http://127.0.0.1:5000/` 首页 12 | - `http://127.0.0.1:5000/search/已发布` 已发布 13 | - `http://127.0.0.1:5000/search/已下载` 已下载 14 | - `http://127.0.0.1:5000/search/有资源` 有资源 15 | 16 | ### 列表页 17 | - `http://127.0.0.1:5000/group` 番号列表 18 | - `http://127.0.0.1:5000/actresses` 演员列表 19 | - `http://127.0.0.1:5000/genre` 类别列表 20 | - `http://127.0.0.1:5000/studio` 制作商列表 21 | - `http://127.0.0.1:5000/label` 发行商列表 22 | - `http://127.0.0.1:5000/series` 系列列表 23 | 24 | ### 明细页 25 | - `http://127.0.0.1:5000/movie/e3dedf889e44cee8` 影片明细 26 | - `http://127.0.0.1:5000/group/IPX` 番号明细 27 | - `http://127.0.0.1:5000/star/1971f1973cf8172f` 演员明细 28 | - `http://127.0.0.1:5000/genre/dd21aefe7ae3228c` 类别明细 29 | - `http://127.0.0.1:5000/studio/80be243ea6164094` 制作商明细 30 | - `http://127.0.0.1:5000/label/b0b3be30e6bf490f` 发行商明细 31 | - `http://127.0.0.1:5000/series/c28ffa16eae1bf1e` 系列明细 32 | - `http://127.0.0.1:5000/director/bb914a54dc51b21b` 导演明细 33 | 34 | ### 收藏页 35 | - `http://127.0.0.1:5000/like/group` 收藏番号 36 | - `http://127.0.0.1:5000/like/movie` 收藏影片 
37 | - `http://127.0.0.1:5000/like/studio` 收藏制作商 38 | - `http://127.0.0.1:5000/like/label` 收藏发行商 39 | - `http://127.0.0.1:5000/like/series` 收藏系列 40 | 41 | ### 分析页 42 | - `http://127.0.0.1:5000/analyse/group/IPX` 分析番号 43 | - `http://127.0.0.1:5000/analyse/star/1971f1973cf8172f` 分析演员 44 | - `http://127.0.0.1:5000/analyse/genre/dd21aefe7ae3228c` 分析类别 45 | - `http://127.0.0.1:5000/analyse/studio/80be243ea6164094` 分析制作商 46 | - `http://127.0.0.1:5000/analyse/label/b0b3be30e6bf490f` 分析发行商 47 | - `http://127.0.0.1:5000/analyse/director/bb914a54dc51b21b` 分析导演 48 | 49 | ### 功能页 50 | - `http://127.0.0.1:5000/spider` 爬虫,输入链接,抓取链接内所有影片 51 | - `http://127.0.0.1:5000/scandisk` 扫描硬盘,扫描本地硬盘,识别番号文件名 52 | - `http://127.0.0.1:5000/config` 修改配置 53 | 54 | ## 注意事项 55 | 1. 右上角的语言切换是用来切换目标站的语言的,会影响演员名/类目名,仿站没有做多语言 56 | 2. avmoo最多只支持抓取到100页,超过100页无法抓取 57 | 3. 图片卡的时候尝试去`config`页面切换`图片cdn源` 58 | 4. 右上角的`链接`按钮指的是avmoo源站对应的链接 59 | 5. 爬虫页可以查看最近写入库中的影片,也可以操作爬虫 60 | 6. 磁力搜索网站可以自己在配置添加,会在末尾拼av_id 61 | 62 | ## 打开方法 63 | 命令行运行`python run.py`启动本地web(首次打开会跳转到安装页面) 64 | 指定配置文件运行`python run.py config_main.db` 65 | 默认地址为`http://127.0.0.1:5000`,端口可通过配置修改 66 | -------------------------------------------------------------------------------- /aqd_spider.py: -------------------------------------------------------------------------------- 1 | from ast import Constant 2 | from asyncio import constants 3 | import time 4 | import sys 5 | from requests import Timeout 6 | import json 7 | 8 | from typing import Iterator 9 | from common import * 10 | 11 | ''' 12 | 抓取aqdav.net的影片添加到扩展信息 13 | 1. 
av_extend 中必须有{aqd}配置,(rename,aqd,https://vip.aqdtv540.com),用来记录最新的地址 14 | ''' 15 | class Aqd: 16 | instance = None 17 | requests_ins = None 18 | db_ins = None 19 | log = logging.getLogger('aqd') 20 | save_file = 'aqd_result.txt' 21 | 22 | @staticmethod 23 | def db(): 24 | if Aqd.db_ins is None: 25 | Aqd.log.info('spider.db.init') 26 | # 链接数据库 27 | Aqd.db_ins = sqlite3.connect(CONFIG.get("base", "db_file")) 28 | Aqd.db_ins.row_factory = make_dicts 29 | return Aqd.db_ins 30 | 31 | @staticmethod 32 | def requests(): 33 | if Aqd.requests_ins is None: 34 | Aqd.log.info('spider.requests.init') 35 | # 创建会话对象 36 | Aqd.requests_ins = requests.Session() 37 | Aqd.requests_ins.headers = { 38 | 'User-Agent': CONFIG.get("requests", "user_agent"), 39 | } 40 | return Aqd.requests_ins 41 | 42 | @staticmethod 43 | def fetchall(sql) -> list: 44 | cur = Aqd.db().cursor() 45 | cur.execute(sql) 46 | return cur.fetchall() 47 | 48 | @staticmethod 49 | def aqd_site_url() -> str: 50 | site_url = CONFIG.get('aqd', 'aqd_site').strip("/") 51 | r = Aqd.requests().get(site_url, timeout=CONFIG.getint("requests", "timeout")) 52 | p = parse.urlparse(r.url) 53 | 54 | new_site_url = "https://" + p.hostname 55 | if site_url != new_site_url: 56 | # 存储新的链接 57 | CONFIG.set(section='aqd', option='aqd_site', value=new_site_url) 58 | config_save(CONFIG) 59 | 60 | return new_site_url 61 | 62 | # 自动翻页返回影片url 63 | @staticmethod 64 | def url_general() -> Iterator[str]: 65 | site_url = Aqd.aqd_site_url() 66 | if empty(site_url): 67 | site_url = "https://www.aqd99.com" 68 | return 69 | 70 | for page_no in range(1, 500): 71 | time.sleep(1) 72 | # 有PART关键字的影片都是AV影片 73 | # url = site_url + '/videos/search?key=PART&page={}'.format(page_no) 74 | url = site_url + '/videos/category/jp/{}'.format(page_no) 75 | Aqd.log.info("get:{}".format(url)) 76 | 77 | (status_code, html) = Aqd.get_html_by_url(url) 78 | if status_code in [403, 404, 500] or html is None: 79 | Aqd.log.fatal("url:{} status_code:{}".format(url, 
status_code)) 80 | break 81 | 82 | item_a_list = html.xpath('//div[@class="row index-videos-list index-videos-item-list"]/div/div/div/a') 83 | if not item_a_list: 84 | Aqd.log.warning("page empty break") 85 | break 86 | for item in item_a_list: 87 | # 判断是否有番号id 88 | title = item.attrib.get('alt') 89 | url = item.attrib.get('href') 90 | # av_id = Aqd.get_av_id(title) 91 | # if empty(av_id): 92 | # continue 93 | head_img = item.xpath('img')[0].attrib.get('data-original') 94 | # Aqd.log.info("aqdurl:{},title:{}".format(url, title)) 95 | yield site_url + url, head_img 96 | 97 | @staticmethod 98 | def movie_save(insert_list: list) -> None: 99 | if empty(insert_list): 100 | return 101 | insert_list_str = "\n".join([json.dumps(x, ensure_ascii = False) for x in insert_list]) 102 | with open(Aqd.save_file, "a", encoding='utf-8') as f: 103 | f.write(insert_list_str + "\n") 104 | 105 | # 解析html数据 106 | @staticmethod 107 | def movie_page_data(html) -> dict: 108 | title = html.xpath("/html/body/section/div[2]/div[2]/div[3]/div/div[1]/h3/text()")[0] 109 | video = "" 110 | res = re.findall(r"(http.+\.m3u8)", html.xpath("//script")[-5].text) 111 | if non_empty(res): 112 | video = res[0] 113 | url = html.xpath('/html/head/meta[15]')[0].attrib.get('content') 114 | data = { 115 | 'id': int(re.findall("\d+$", url)[0]), 116 | 'title': title, 117 | 'av_id': Aqd.get_av_id(title), 118 | 'video': video, 119 | 'img': '', 120 | # 发行时间 121 | 'date': html.xpath('/html/body/section/div[2]/div[2]/div[3]/div/div[3]/span/text()')[0].strip()[-19:] 122 | } 123 | return data 124 | 125 | @staticmethod 126 | def get_html_by_url(url: str) -> tuple: 127 | retry_limit = 100 128 | for i in range(retry_limit): 129 | try: 130 | res = Aqd.requests().get(url, timeout=CONFIG.getint("requests", "timeout")) 131 | if res.status_code != 200: 132 | Aqd.log.error("status_code = {},url:{}".format(res.status_code, url)) 133 | return res.status_code, None 134 | 135 | return 200, etree.HTML(res.text) 136 | except Timeout 
as e: 137 | Aqd.log.warning("requests Timeout,error:{}\nretry url:{}".format( 138 | e, url 139 | )) 140 | # 休眠 141 | time.sleep(10) 142 | # 超时重试 143 | continue 144 | 145 | except ConnectionError as e: 146 | Aqd.log.warning("requests ConnectionError,error:{}\nretry url:{}".format( 147 | e, url 148 | )) 149 | # 休眠 150 | time.sleep(10) 151 | # 链接异常 152 | continue 153 | 154 | except Exception as e: 155 | Aqd.log.warning("requests Exception:{}\nurl:{}".format(e, url)) 156 | time.sleep(10) 157 | continue 158 | # 返回错误 159 | return 500, None 160 | 161 | @staticmethod 162 | def get_av_id(title: str) -> str: 163 | ''' 164 | 从title中获取avid,取不到返回空''' 165 | res = re.findall(r"\[([A-Z]+\-\d+)\]", title) 166 | if not res: 167 | return '' 168 | return res[0] 169 | 170 | @staticmethod 171 | def get_max_id() -> int: 172 | max_id = 0 173 | with open(Aqd.save_file, "r", encoding="utf-8") as f: 174 | for line in f.readlines(): 175 | row = json.loads(line.strip()) 176 | if row["id"] > max_id: 177 | max_id = row["id"] 178 | return max_id 179 | 180 | @staticmethod 181 | def fetch_data(): 182 | max_id = aqd.get_max_id() 183 | for url, img in aqd.url_general(): 184 | id = re.findall("\d+$", url)[0] 185 | if int(id) <= max_id: 186 | break 187 | status_code,html = Aqd.get_html_by_url(url) 188 | if status_code != 200: 189 | continue 190 | 191 | Aqd.log.info('fetch:{}'.format(url)) 192 | data = Aqd.movie_page_data(html) 193 | data['img'] = img 194 | Aqd.movie_save([data]) 195 | 196 | @staticmethod 197 | def insert_data(): 198 | with open(Aqd.save_file, "r", encoding="utf-8") as f: 199 | file_data = f.read() 200 | file_data = file_data.split("\n") 201 | for i in range(len(file_data))[::-1]: 202 | line = file_data[i].strip() 203 | if len(line) < 20: 204 | continue 205 | row = json.loads(line) 206 | if empty(row['av_id']): 207 | continue 208 | # 查询库里有没有当前id 209 | res = fetchall("select * from av_list where av_id ='{}'".format(row['av_id'])) 210 | if empty(res): 211 | Aqd.log.warning("av_id:{} none 
{}".format(row["av_id"], get_url("search", row["av_id"]))) 212 | else: 213 | Aqd.log.info("av_id:{} complete {}".format(row["av_id"], get_local_url("movie", row["av_id"]))) 214 | 215 | m3u8_url = "{}#{}".format(row["video"], row["id"]) 216 | # 查询数据是不是已存在 217 | res = fetchall("select * from av_extend where extend_name='movie_res' and key='{}' and val='{}'".format(row['av_id'], m3u8_url)) 218 | if non_empty(res): 219 | Aqd.log.info("{} exist,break".format(row['av_id'])) 220 | break 221 | insert("av_extend", [{ 222 | "extend_name": "movie_res", 223 | "key": row['av_id'], 224 | "val": m3u8_url 225 | }]) 226 | if __name__ == '__main__': 227 | # python aqd_spider.py ./config.ini.default 228 | if len(sys.argv) == 2: 229 | conf_file = sys.argv[1] 230 | else: 231 | print("wrong config file.") 232 | exit() 233 | 234 | init(conf_file) 235 | create_logger('aqd') 236 | aqd = Aqd() 237 | Aqd.log.info("[fetch_data start]") 238 | aqd.fetch_data() 239 | Aqd.log.info("[insert_data start]") 240 | aqd.insert_data() 241 | 242 | # print(aqd.get_max_id()) 243 | # status_code,html = Aqd.get_html_by_url("/videos/play/6988") 244 | # data = Aqd.movie_page_data(html) 245 | # print(data) 246 | -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | from sqlite3 import Connection 4 | 5 | import requests 6 | import configparser 7 | import os 8 | import re 9 | import sqlite3 10 | from lxml import etree 11 | import webbrowser 12 | import threading 13 | import binascii 14 | import traceback 15 | from urllib.parse import quote 16 | from queue import Queue 17 | from define import * 18 | from urllib import parse 19 | 20 | CONFIG_FILE = "config.ini" 21 | CONFIG_FILE_DEFAULT = "config.ini.default" 22 | CONFIG = configparser.ConfigParser() 23 | 24 | DB: Connection = None 25 | 26 | # 存储 av_genre,av_extend, rename数据,用于快速查找 27 | DATA_STORAGE = {} 28 | 
29 | # 缓存 30 | SQL_CACHE = {} 31 | 32 | # 任务队列 33 | QUEUE = Queue(maxsize=0) 34 | 35 | LOG_FORMAT = "%(asctime)s [%(filename)s:%(lineno)d] %(levelname)s: %(message)s" 36 | LOGGER = logging.getLogger(APP_NAME) 37 | 38 | 39 | def init(file_name = None): 40 | global CONFIG_FILE 41 | # 创建日志 42 | create_logger(APP_NAME) 43 | # 关闭 werkzeug 的日志 44 | logging.getLogger('werkzeug').setLevel(logging.ERROR) 45 | LOGGER.info("common.init") 46 | 47 | if non_empty(file_name): 48 | # 命令行指定配置文件 49 | CONFIG_FILE = file_name 50 | # 初始化配置 51 | config_check() 52 | config_init() 53 | db_init() 54 | 55 | 56 | def make_dicts(cursor, row): 57 | return dict((cursor.description[idx][0], value) 58 | for idx, value in enumerate(row)) 59 | 60 | 61 | def db_init(): 62 | LOGGER.info("common.init.db") 63 | # 初始化db 64 | global DB 65 | db_file = CONFIG.get("base", "db_file") 66 | if os.path.exists(db_file): 67 | DB = sqlite3.connect(db_file, check_same_thread=False) 68 | DB.row_factory = make_dicts 69 | 70 | 71 | def storage_init(table: str) -> None: 72 | if table in DATA_STORAGE and non_empty(DATA_STORAGE[table]): 73 | return 74 | DATA_STORAGE[table] = fetchall("SELECT * FROM " + table) 75 | 76 | 77 | # 仅av_genre和av_extend使用 78 | def storage(table: str, conditions: dict = None, col: str = None) -> list: 79 | storage_init(table) 80 | ret = [] 81 | if not conditions: 82 | return DATA_STORAGE[table] 83 | # 每条记录 84 | for row in DATA_STORAGE[table]: 85 | hit = True 86 | # 每个条件 87 | for cond_key, cond_val in conditions.items(): 88 | if not cond_val: 89 | continue 90 | if isinstance(cond_val, str): 91 | if cond_val != row[cond_key]: 92 | hit = False 93 | break 94 | elif isinstance(cond_val, list): 95 | if row[cond_key] not in cond_val: 96 | hit = False 97 | break 98 | else: 99 | LOGGER.fatal("wrong type") 100 | if not hit: 101 | continue 102 | if col: 103 | ret.append(row[col]) 104 | else: 105 | ret.append(row) 106 | return ret 107 | 108 | 109 | def config_path() -> str: 110 | if 
os.path.exists(CONFIG_FILE): 111 | return CONFIG_FILE 112 | return CONFIG_FILE_DEFAULT 113 | 114 | 115 | def config_init() -> None: 116 | # 初始化配置 117 | LOGGER.info('CONFIG FILE:%r', config_path()) 118 | CONFIG.read(config_path()) 119 | 120 | 121 | # 配置文件 122 | def config_check(): 123 | if not os.path.exists(CONFIG_FILE): 124 | return 125 | config = configparser.ConfigParser() 126 | config.read(CONFIG_FILE) 127 | config_default = configparser.ConfigParser() 128 | config_default.read(CONFIG_FILE_DEFAULT) 129 | for (section, option) in [x.split('.') for x in CONFIG_NAME_LIST]: 130 | if not config.has_section(section): 131 | config.add_section(section) 132 | if not config.has_option(section, option): 133 | config.set(section, option, config_default.get(section, option)) 134 | config_save(config) 135 | 136 | 137 | def config_save(config): 138 | with open(CONFIG_FILE, "w") as fp: 139 | config.write(fp) 140 | 141 | 142 | # 创建日志记录器 143 | def create_logger(app_name: str): 144 | logger = logging.getLogger(app_name) 145 | logger.setLevel(logging.INFO) # Log等级总开关 146 | # 第二步,创建一个handler,用于写入日志文件 147 | if not os.path.exists('logs'): 148 | os.mkdir('logs') 149 | log_path = os.getcwd() + '/logs/' 150 | logfile = log_path + app_name + '.' 
+ time.strftime('%Y%m%d%H', time.localtime(time.time())) + '.log' 151 | 152 | fh = logging.FileHandler(logfile, mode='a', encoding='utf-8') 153 | fh.setLevel(logging.DEBUG) # 输出到file的log等级的开关 154 | # 第三步,定义handler的输出格式 155 | fh.setFormatter(logging.Formatter(LOG_FORMAT)) 156 | # 第四步,将logger添加到handler里面 157 | logger.addHandler(fh) 158 | 159 | fh = logging.StreamHandler() 160 | fh.setLevel(logging.DEBUG) # 输出到file的log等级的开关 161 | # 第三步,定义handler的输出格式 162 | fh.setFormatter(logging.Formatter(LOG_FORMAT)) 163 | # 第四步,将logger添加到handler里面 164 | logger.addHandler(fh) 165 | 166 | 167 | def replace_sql_build(table: str, data: dict) -> str: 168 | sql = "REPLACE INTO {} ({}) VALUES ({})".format( 169 | table, ','.join(list(data)), ("?," * len(data))[:-1] 170 | ) 171 | return sql 172 | 173 | 174 | # sql插入操作 175 | # av_genre av_extend 176 | def insert(table: str, data: list): 177 | if CONFIG.getboolean("base", "readonly"): 178 | return 179 | if not data: 180 | return 181 | sql = replace_sql_build(table, data[0]) 182 | if len(sql) < 150: 183 | LOGGER.info(color(36, sql)) 184 | else: 185 | LOGGER.info(color(36, "INSERT,table:{},count:{}".format(table, len(data)))) 186 | DB.cursor().executemany(sql, [tuple(x.values()) for x in data]) 187 | DB.commit() 188 | if table in DATA_STORAGE: 189 | DATA_STORAGE[table].clear() 190 | 191 | 192 | # sql删除操作 193 | def delete(table: str, data: dict): 194 | if CONFIG.getboolean("base", "readonly"): 195 | return 196 | if not data: 197 | return 198 | sql = "DELETE FROM {} WHERE {}".format( 199 | table, " AND ".join(["{}='{}'".format(field, value) for field, value in data.items()])) 200 | execute(sql) 201 | if table in DATA_STORAGE: 202 | DATA_STORAGE[table].clear() 203 | 204 | 205 | # 执行sql 206 | def execute(sql): 207 | if CONFIG.getboolean("base", "readonly"): 208 | return 209 | LOGGER.info(color(35, sql)) 210 | DB.cursor().execute(sql) 211 | DB.commit() 212 | 213 | 214 | # 查询sql 没缓存 215 | def fetchall(sql) -> list: 216 | if DB is None: 217 | # 触发安装程序 
218 | raise IOError('db') 219 | 220 | cur = DB.cursor() 221 | LOGGER.info(color(36, sql)) 222 | cur.execute(sql) 223 | return cur.fetchall() 224 | 225 | 226 | # 查询sql 带缓存 227 | def query_sql(sql) -> list: 228 | cache_key = gen_cache_key(sql) 229 | # 是否使用缓存 230 | if CONFIG.getboolean("website", "use_cache"): 231 | LOGGER.info('CACHE[%s]', cache_key) 232 | # 是否有缓存 233 | if cache_key in SQL_CACHE.keys(): 234 | return SQL_CACHE[cache_key][:] 235 | else: 236 | ret = fetchall(sql) 237 | if CONFIG.getboolean("website", "use_cache") and ret != []: 238 | SQL_CACHE[cache_key] = ret 239 | return ret[:] 240 | else: 241 | return fetchall(sql) 242 | 243 | 244 | def get_new_avmoo_site() -> str: 245 | res = requests.get('https://tellme.pw/avmoo') 246 | html = etree.HTML(res.text) 247 | avmoo_site = html.xpath( 248 | '/html/body/div[1]/div[2]/div/div[2]/h4[1]/strong/a/@href')[0] 249 | return avmoo_site 250 | 251 | 252 | def list_in_str(target_list: tuple, target_string: str) -> bool: 253 | for item in target_list: 254 | if item in target_string: 255 | return True 256 | return False 257 | 258 | 259 | def get_url(page_type: str = '', keyword: str = '', page_no: int = 1) -> str: 260 | ret = '{}/{}'.format(CONFIG.get("base", "avmoo_site"), 261 | CONFIG.get("base", "country"), ) 262 | if page_type == "search": 263 | if keyword != '': 264 | ret += '/{}/{}'.format(page_type, keyword) 265 | else: 266 | if page_type != '': 267 | ret += '/{}'.format(page_type) 268 | if keyword != '': 269 | ret += '/{}'.format(keyword) 270 | if page_no > 1: 271 | ret += '/page/{}'.format(page_no) 272 | return ret 273 | 274 | 275 | def get_local_url(page_type: str = '', keyword: str = '', page_no: int = 1) -> str: 276 | ret = 'http://{}:{}'.format(LOCAL_IP, CONFIG.getint("base", "port")) 277 | if page_type == "popular": 278 | return '' 279 | if page_type != '': 280 | ret += '/{}'.format(page_type) 281 | if keyword != '': 282 | ret += '/{}'.format(keyword) 283 | if page_no > 1: 284 | ret += 
'/page/{}'.format(page_no) 285 | return ret 286 | 287 | 288 | def search_where(key_item: str) -> str: 289 | key_item = sql_escape(key_item) 290 | return "(av_list.title LIKE '%{0}%' OR ".format(key_item) + \ 291 | "av_list.director = '{0}' OR ".format(key_item) + \ 292 | "av_list.studio = '{0}' OR ".format(key_item) + \ 293 | "av_list.label = '{0}' OR ".format(key_item) + \ 294 | "av_list.series LIKE '%{0}%' OR ".format(key_item) + \ 295 | "av_list.genre LIKE '%{0}%' OR ".format(key_item) + \ 296 | "av_list.stars LIKE '%{0}%')".format(key_item) 297 | 298 | 299 | def open_browser_tab(url): 300 | if not url: 301 | return 302 | LOGGER.info("open_browser_tab:%s", url) 303 | 304 | def _open_tab(url_param): 305 | webbrowser.open_new_tab(url_param) 306 | 307 | thread = threading.Thread(target=_open_tab, args=(url,)) 308 | thread.daemon = True 309 | thread.start() 310 | 311 | 312 | def sql_escape(keyword: str) -> str: 313 | for item in ESCAPE_LIST: 314 | keyword = keyword.replace(item[0], item[1]) 315 | return keyword 316 | 317 | 318 | # 解析源站url, 返回 page_type, keyword, page_start 319 | def parse_url(url: str) -> tuple: 320 | if url is None or url == '': 321 | return '', '', -1 322 | 323 | pattern_1 = "https?://[^/]+/[^/]+/popular(/page/(\\d+))?" 324 | pattern_2 = "https?://[^/]+/[^/]+/(movie|star|genre|series|studio|label|director|search)/([^/]+)(/page/(\\d+))?" 
325 | 326 | if re.match(pattern_1, url): 327 | res = re.findall(pattern_1, url) 328 | page_start = int(res[0][1]) if res[0][1] else 1 329 | return "popular", '', page_start 330 | 331 | if re.match(pattern_2, url): 332 | res = re.findall(pattern_2, url) 333 | page_start = int(res[0][3]) if res[0][3] else 1 334 | return res[0][0], res[0][1], page_start 335 | 336 | LOGGER.fatal("wrong url:{}".format(url)) 337 | return '', '', -1 338 | 339 | 340 | # 获取sql中的表名 341 | def get_table_name(sql): 342 | return list(set(re.findall("(av_[a-z]+)", sql))) 343 | 344 | 345 | # 获取缓存key 346 | def gen_cache_key(sql): 347 | return '|'.join(get_table_name(sql)) + ':' + str(binascii.crc32(sql.encode()) & 0xffffffff) 348 | 349 | 350 | def empty(i: any) -> bool: 351 | if i is None: 352 | return True 353 | if isinstance(i, str): 354 | return i == '' 355 | if isinstance(i, list) or isinstance(i, tuple): 356 | return len(i) == 0 357 | if isinstance(i, dict): 358 | return i == {} 359 | if isinstance(i, int) or isinstance(i, float): 360 | return i == 0 361 | return False 362 | 363 | 364 | def non_empty(i: any) -> bool: 365 | return not empty(i) 366 | 367 | 368 | # 命令行颜色 369 | def color(c, s): 370 | if not CONFIG.getboolean('log', 'ansi_color'): 371 | return s 372 | """ 373 | \033[30m黑\033[0m 374 | \033[31m酱红\033[0m 375 | \033[32m浅绿\033[0m 376 | \033[33m黄褐\033[0m 377 | \033[34m浅蓝\033[0m 378 | \033[35m紫\033[0m 379 | \033[36m天蓝\033[0m 380 | \033[37m灰白\033[0m 381 | """ 382 | return "\033[{}m{}\033[0m".format(c, s) 383 | 384 | 385 | def upper_path(path: str) -> str: 386 | # 如果为windows环境路径,则路径首字母大写 387 | if re.match("^[a-z]:\\\\", path): 388 | return path[0].upper() + path[1:] 389 | else: 390 | return path 391 | 392 | 393 | def a_tag_build(link): 394 | return '{}'.format(link, link) 395 | 396 | 397 | # 识别linkid 398 | def is_linkid(linkid: str = '') -> bool: 399 | if empty(linkid): 400 | return False 401 | return re.match('^[a-z0-9]{16}$', linkid) is not None 402 | 403 | 404 | # 替换链接中命中rename的{rename} 
405 | def url_rename(s: str) -> str: 406 | res = re.findall("{(.+)}", s) 407 | if res: 408 | return s.replace('{'+res[0]+'}', rename(res[0])) 409 | return s 410 | 411 | 412 | # 重命名 413 | def rename(name): 414 | # 渲染前准备rename数据 415 | storage_init(AV_EXTEND) 416 | if 'rename' not in DATA_STORAGE: 417 | DATA_STORAGE['rename'] = {} 418 | for row in DATA_STORAGE[AV_EXTEND]: 419 | if row['extend_name'] == 'rename': 420 | DATA_STORAGE['rename'][row['key']] = row['val'] 421 | if name in DATA_STORAGE['rename']: 422 | return DATA_STORAGE['rename'][name] 423 | return name 424 | 425 | 426 | # 列表小头图 427 | def small_img(s): 428 | return CONFIG.get('website', 'cdn') + '/digital/video' + s[:-6] + 'ps' + s[-4:] 429 | 430 | 431 | # 获取大头图 432 | def big_img(s): 433 | return CONFIG.get('website', 'cdn') + '/digital/video' + s 434 | 435 | 436 | # 是否为可播放的链接 437 | def can_play_url(s): 438 | p = parse.urlparse(s) 439 | if p.scheme not in ['http', 'https']: 440 | return False 441 | return list_in_str(('.m3u8', '.mp4', '.flv'), p.path) 442 | 443 | 444 | if __name__ == "__main__": 445 | pass 446 | -------------------------------------------------------------------------------- /config.ini.default: -------------------------------------------------------------------------------- 1 | [base] 2 | avmoo_site = https://avmoo.sbs 3 | db_file = avmoo.db 4 | port = 5000 5 | debug_mode = False 6 | # readonly,nothing update in sqlite db 7 | readonly = False 8 | 9 | # cn/tw/en/ja 10 | country = cn 11 | 12 | [spider] 13 | # could by a decimal number 14 | sleep = 2 15 | 16 | # integer 17 | insert_threshold = 10 18 | 19 | # continued skip count > continued_skip_limit, crawl end 20 | # integer 21 | continued_skip_limit = 30 22 | 23 | minimum_movie_duration = 0 24 | 25 | [requests] 26 | timeout = 3 27 | user_agent = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 28 | 29 | [website] 30 | # image server,//jp.netcdn.space, //pics.dmm.com, 
//pics.dmm.co.jp
cdn = //pics.dmm.com

# page item limit
page_limit = 30

# /actresses item limit
actresses_page_limit = 36

# /group item limit
group_page_limit = 30

# update work list and newest movie, interval time(ms)
spider_page_interval_timeout = 20000

# release_date or count
group_page_order_by = count

# cache enable or not
use_cache = False

# auto open site on run or not
auto_open_site_on_run = True

# auto open link when spider done work
auto_open_link_when_crawl_done = True

# fast run,less function
efficiency_mode = False

# search site url,av_id will be end of url
search_url = https://btsow.rest/search/

[log]
ansi_color = False

[aqd]
# from aqd get movie m3u8 play link
aqd_site = https://vip.aqdx200.com
--------------------------------------------------------------------------------
/define.py:
--------------------------------------------------------------------------------
# Flask application name
APP_NAME = 'website'

# Table names
AV_STARS = 'av_stars'
AV_GENRE = 'av_genre'
AV_LIST = 'av_list'
AV_EXTEND = 'av_extend'

# Language switcher (top-right corner); switches the *target* site language
COUNTRY_MAP = {
    'en': 'English',
    'ja': '日本语',
    'tw': '正體中文',
    'cn': '简体中文',
}

# Per page_type metadata driving list pages, favorites and renaming
PAGE_TYPE_MAP = {
    # page_type key
    'director': {
        # display name of the page
        'name': '导演',
        # whether this type can be favorited
        'like_enable': False,
        # whether the display name can be overridden via rename
        'rename_enable': True,
        # db column, also the "like" key
        'key': 'director_url',
        # WHERE-clause template for the av_list movie query
        'where': "director_url='{}'",
    },
    'movie': {
        'name': '影片',
        'like_enable': True,
        'rename_enable': False,
        'key': 'av_id',
        'where': "linkid='{0}' OR av_id='{0}'",
    },
    'studio': {
        'name': '制作商',
        'like_enable': True,
        'rename_enable': True,
        'key': 'studio_url',
        'where': "studio_url='{}'",
    },
    'label': {
        'name': '发行商',
        'like_enable': True,
        'rename_enable': True,
        'key': 'label_url',
        'where': "label_url='{}'",
    },
    'series': {
        'name': '系列',
        'like_enable': True,
        'rename_enable': True,
        'key': 'series_url',
        'where': "series_url='{}'",
    },
    'star': {
        'name': '演员',
        'like_enable': False,
        'rename_enable': True,
        'key': 'stars_url',
        # stars_url stores '|'-prefixed, '|'-joined ids, hence the GLOB pattern
        'where': "stars_url GLOB '*|{}*'",
    },
    'genre': {
        'name': '类别',
        'like_enable': False,
        'rename_enable': True,
        'key': 'genre_url',
        # NOTE(review): the genre column stores '|name|name|' — confirm the
        # substituted value here is a genre name, not the linkid
        'where': "genre GLOB '*|{}|*'",
    },
    'group': {
        'name': '番号',
        'like_enable': True,
        'rename_enable': False,
        'key': 'group',
        # a "group" is the av_id prefix before the dash, e.g. IPX of IPX-001
        'where': "av_id LIKE '{}-%'",
    },
    'like': {
        'name': '收藏',
        'like_enable': False,
        'rename_enable': False,
    },
}

# sqlite escape list: (char, escaped form), '/' is the escape character
ESCAPE_LIST = (
    ("/", "//"),
    ("'", "''"),
    ("[", "/["),
    ("]", "/]"),
    ("%", "/%"),
    ("&", "/&"),
    ("_", "/_"),
    ("(", "/("),
    (")", "/)"),
)

# the target site serves at most 100 pages per listing
PAGE_MAX = 100

LOCAL_IP = "127.0.0.1"

# config keys editable on the /config page
CONFIG_NAME_LIST = [
    "base.avmoo_site",
    "base.db_file",
    "base.port",
    "base.debug_mode",
    "base.readonly",
    "base.country",

    "spider.sleep",
    "spider.insert_threshold",
    "spider.continued_skip_limit",
    "spider.minimum_movie_duration",

    "requests.timeout",
    "requests.user_agent",

    "website.cdn",
    "website.page_limit",
    "website.actresses_page_limit",
    "website.group_page_limit",
    "website.spider_page_interval_timeout",
    "website.search_url",

    "website.group_page_order_by",
    "website.use_cache",
    "website.auto_open_site_on_run",
    "website.auto_open_link_when_crawl_done",
    "website.efficiency_mode",

    "log.ansi_color",
]

#
/spider 文件类型与判定 139 | FILE_TAIL = { 140 | 'mp4': "\\.(mp4|mkv|flv|avi|rm|rmvb|mpg|mpeg|mpe|m1v|mov|3gp|m4v|m3p|wmv|wmp|wm)$", 141 | 'jpg': "\\.(jpg|png|gif|jpeg|bmp|ico)$", 142 | 'mp3': "\\.(mp3|wav|wmv|mpa|mp2|ogg|m4a|aac)$", 143 | 'torrent': "\\.torrent$", 144 | 'zip': "\\.(zip|rar|gz|7z)$", 145 | 'doc': "\\.(xls|xlsx|doc|docx|ppt|pptx|csv|pdf|html|txt)$", 146 | } 147 | 148 | # /spider av视频文件判断正则 149 | AV_FILE_REG = "[a-zA-Z]{3,5}-\\d{3,4}" 150 | 151 | CREATE_AV_GENRE_SQL = ''' 152 | CREATE TABLE IF NOT EXISTS "av_genre" ( 153 | "linkid" CHAR(16) NOT NULL, 154 | "name" TEXT, 155 | "title" TEXT, 156 | PRIMARY KEY ("linkid") 157 | ); 158 | ''' 159 | 160 | CREATE_AV_LIST_SQL = ''' 161 | CREATE TABLE IF NOT EXISTS "av_list" ( 162 | "linkid" CHAR(16) NOT NULL, 163 | "title" TEXT, 164 | "av_id" VARCHAR(20), 165 | "release_date" CHAR(10), 166 | "len" INTEGER, 167 | "director" TEXT, 168 | "studio" TEXT, 169 | "label" TEXT, 170 | "series" TEXT, 171 | "genre" TEXT, 172 | "stars" TEXT, 173 | "director_url" TEXT, 174 | "studio_url" CHAR(16), 175 | "label_url" CHAR(16), 176 | "series_url" TEXT, 177 | "stars_url" TEXT, 178 | "bigimage" TEXT, 179 | "image_len" INTEGER, 180 | PRIMARY KEY ("linkid") 181 | ); 182 | ''' 183 | 184 | CREATE_AV_STARS_SQL = ''' 185 | CREATE TABLE IF NOT EXISTS "av_stars" ( 186 | "linkid" CHAR(16) NOT NULL, 187 | "name" TEXT, 188 | "name_history" TEXT, 189 | "birthday" TEXT, 190 | "height" TEXT, 191 | "cup" CHAR(1), 192 | "bust" TEXT, 193 | "waist" TEXT, 194 | "hips" TEXT, 195 | "hometown" TEXT, 196 | "hobby" TEXT, 197 | "headimg" TEXT, 198 | PRIMARY KEY ("linkid") 199 | ); 200 | ''' 201 | 202 | CREATE_AV_EXTEND_SQL = ''' 203 | CREATE TABLE IF NOT EXISTS "av_extend" ( 204 | "id" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, 205 | "extend_name" VARCHAR(10) NOT NULL, 206 | "key" VARCHAR(20) NOT NULL, 207 | "val" TEXT NOT NULL 208 | ); 209 | ''' 210 | 211 | AV_GENRE_DEMO_DATA = [ 212 | ('like', 'group', 'SSIS'), 213 | ('like', 'studio_url', 
'80be243ea6164094'), 214 | ('like', 'label_url', 'b0b3be30e6bf490f'), 215 | ('like', 'series_url', 'c343a1499f108277'), 216 | ('like', 'av_id', 'SSIS-318'), 217 | ('like', 'av_id', 'SSIS-318'), 218 | ('movie_res', 'SSIS-318', 'magnet:?xt=urn:btih:E0C7B27071A832388AF9C54553EECF71F4094256&dn=SSIS-318-C'), 219 | ] 220 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import common 2 | import spider 3 | import website 4 | import sys 5 | 6 | # 配置初始化 7 | # 1. 读取配置 8 | # 2. 初始化db 9 | # 3. 建表 10 | config_file = None 11 | if len(sys.argv) > 1: 12 | config_file = sys.argv[1] 13 | common.init(config_file) 14 | 15 | # 爬虫类初始化 16 | # 1. 初始化db 17 | # 2. 初始化requests 18 | # 3. genre为空则获取 19 | # 4. 启动爬虫线程 20 | spider.Spider().run() 21 | 22 | # flask应用 23 | website.run() 24 | -------------------------------------------------------------------------------- /spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | import time 4 | 5 | from requests import Timeout 6 | 7 | from typing import Iterator 8 | from common import * 9 | 10 | 11 | class Spider: 12 | instance = None 13 | requests_ins = None 14 | db_ins = None 15 | log = logging.getLogger('spider') 16 | 17 | def __init__(self): 18 | self.last_insert_list = [] 19 | self.running_work = None 20 | self.done_work = [] 21 | Spider.log.info("spider.init") 22 | 23 | def __new__(cls, *args, **kwargs): 24 | if not cls.instance: 25 | cls.instance = super(Spider, cls).__new__(cls) 26 | return cls.instance 27 | 28 | def run(self): 29 | create_logger('spider') 30 | # 启动爬虫线程 31 | if CONFIG.getboolean("base", "readonly"): 32 | return 33 | thread = threading.Thread(target=self.spider_thread, args=()) 34 | thread.daemon = True 35 | thread.start() 36 | 37 | @staticmethod 38 | def db(): 39 | if Spider.db_ins is None: 40 | 
Spider.log.info('spider.db.init') 41 | # 链接数据库 42 | Spider.db_ins = sqlite3.connect(CONFIG.get("base", "db_file")) 43 | Spider.db_ins.row_factory = make_dicts 44 | return Spider.db_ins 45 | 46 | @staticmethod 47 | def requests(): 48 | if Spider.requests_ins is None: 49 | requests.packages.urllib3.disable_warnings() 50 | Spider.log.info('spider.requests.init') 51 | # 创建会话对象 52 | Spider.requests_ins = requests.Session() 53 | # 忽略证书 54 | Spider.requests_ins.verify = False 55 | Spider.requests_ins.headers = { 56 | 'User-Agent': CONFIG.get("requests", "user_agent"), 57 | } 58 | # 代理 59 | Spider.requests_ins.proxies = { 60 | # 'https':'http://127.0.0.1:1080' 61 | } 62 | return Spider.requests_ins 63 | 64 | # 爬虫线程 65 | def spider_thread(self): 66 | Spider.log.info("spider_thread.start") 67 | while True: 68 | time.sleep(CONFIG.getfloat("spider", "sleep")) 69 | 70 | # 获取一个任务 71 | work_param = QUEUE.get() 72 | work_param["url"] = get_url(work_param["page_type"], work_param["keyword"], work_param["page_start"]) 73 | work_param["status"] = "ING" 74 | 75 | # 记录运行中任务 76 | self.running_work = work_param.copy() 77 | 78 | work_param["exist_linkid"] = {} 79 | # 是否跳过 默认跳过 80 | if "skip_exist" not in work_param or work_param.get("skip_exist"): 81 | work_param["exist_linkid"] = Spider.get_exist_linkid(work_param["page_type"], work_param["keyword"]) 82 | 83 | Spider.log.info("[crawl start]url:{0[url]} page_limit:{0[page_limit]}, exist_count:{1}".format( 84 | work_param, len(work_param["exist_linkid"]))) 85 | ret = self.crawl_accurate(work_param) 86 | 87 | # 打开浏览器提醒抓取完成 88 | if ret: 89 | # 清空缓存 90 | if CONFIG.getboolean("website", "use_cache"): 91 | SQL_CACHE.clear() 92 | if CONFIG.getboolean("website", "auto_open_link_when_crawl_done"): 93 | open_browser_tab(get_local_url(work_param["page_type"], work_param["keyword"], work_param["page_start"])) 94 | 95 | 96 | if "exist_linkid" in self.running_work: 97 | del self.running_work["exist_linkid"] 98 | self.done_work.append(self.running_work) 
99 | self.running_work = None 100 | 101 | def get_last_insert_list(self): 102 | max_count = CONFIG.getint("spider", "insert_threshold") 103 | if len(self.last_insert_list) > max_count: 104 | # 取最后几个 105 | self.last_insert_list = self.last_insert_list[-max_count:] 106 | return self.last_insert_list 107 | 108 | def get_running_work(self, action: str = ''): 109 | if action: 110 | self.running_work["status"] = action 111 | return 112 | return self.running_work 113 | 114 | def get_done_work(self): 115 | return self.done_work 116 | 117 | @staticmethod 118 | def fetchall(sql) -> list: 119 | cur = Spider.db().cursor() 120 | cur.execute(sql) 121 | return cur.fetchall() 122 | 123 | # 根据链接参数抓取 124 | def crawl_accurate(self, work_param: dict) -> bool: 125 | page_type = work_param["page_type"] 126 | if not page_type: 127 | Spider.log.error("wrong param") 128 | return False 129 | # 单个电影 130 | if page_type == "movie": 131 | (status_code, data) = Spider.crawl_by_movie_linkid(work_param["keyword"]) 132 | if empty(data) or status_code != 200: 133 | Spider.log.warning("crawl_by_movie_linkid wrong,data:%r,status_code:%d", data, status_code) 134 | return False 135 | self.movie_save([data]) 136 | return True 137 | # 其他 138 | if page_type in ('genre', 'series', 'studio', 'label', 'director', 'search', 'star', 'popular'): 139 | self.crawl_by_page_type(work_param) 140 | return True 141 | Spider.log.fatal("wrong param,work_param:%s", work_param) 142 | return False 143 | 144 | # 获取所有类别 145 | @staticmethod 146 | def crawl_genre() -> list: 147 | genre_url = get_url('genre', '') 148 | Spider.log.info("get:%s", genre_url) 149 | (status_code, html) = Spider.get_html_by_url(genre_url) 150 | insert_list = [] 151 | h4 = html.xpath('/html/body/div[2]/h4/text()') 152 | div = html.xpath('/html/body/div[2]/div') 153 | for div_item in range(len(div)): 154 | g_title = h4[div_item] 155 | a_list = div[div_item].xpath('a') 156 | for a_item in a_list: 157 | if empty(a_item.text): 158 | continue 159 | 
insert_list.append({ 160 | "linkid": a_item.attrib.get('href')[-16:], 161 | "name": a_item.text, 162 | "title": g_title 163 | }) 164 | Spider.log.info('genre fetch record:%r', len(insert_list)) 165 | return insert_list 166 | 167 | # 根据页面类型抓取所有影片 168 | def crawl_by_page_type(self, work_param: dict) -> None: 169 | if work_param["page_type"] == 'star': 170 | Spider.stars_one(work_param["keyword"]) 171 | # 待插入 172 | insert_list = [] 173 | insert_count = 0 174 | skip_count = 0 175 | banned_count = 0 176 | continued_skip_count = 0 177 | for movie_linkid in Spider.linkid_general(work_param): 178 | # 跳出 179 | if self.running_work["status"] != "ING": 180 | # 任务结束 181 | break 182 | 183 | # 跳过已存在的 184 | if movie_linkid in work_param["exist_linkid"]: 185 | skip_count += 1 186 | continued_skip_count += 1 187 | # Spider.log.info("SKIP EXIST,URL:%s", get_local_url("movie", movie_linkid)) 188 | # 连续跳过到指定数量,则跳出抓取 189 | if continued_skip_count >= CONFIG.getint("spider", "continued_skip_limit"): 190 | break 191 | continue 192 | 193 | continued_skip_count = 0 194 | time.sleep(CONFIG.getfloat("spider", "sleep")) 195 | 196 | (status_code, data) = Spider.crawl_by_movie_linkid(movie_linkid) 197 | if status_code == 403: 198 | banned_count += 1 199 | if banned_count == 10: 200 | Spider.log.info("banned count:%d,break loop", banned_count) 201 | break 202 | continue 203 | if empty(data): 204 | continue 205 | 206 | # 判断影片是否符合要求 207 | duration = CONFIG.getint("spider", "minimum_movie_duration") 208 | if duration > 0 and data["len"] < duration: 209 | Spider.log.info("movie duration non conformance,url:%s", get_url("movie", movie_linkid)) 210 | continue 211 | 212 | insert_list.append(data) 213 | # 存储数据 214 | if len(insert_list) == CONFIG.getint("spider", "insert_threshold"): 215 | self.movie_save(insert_list) 216 | insert_count += len(insert_list) 217 | insert_list = [] 218 | # 插入剩余的数据 219 | self.movie_save(insert_list) 220 | insert_count += len(insert_list) 221 | 
Spider.log.info("[exist_count:{}][fetch_count:{}][skip_count:{}]".format( 222 | len(work_param["exist_linkid"]), insert_count, skip_count)) 223 | 224 | # 根据linkid抓取一个movie页面 225 | @staticmethod 226 | def crawl_by_movie_linkid(movie_linkid: str) -> tuple: 227 | url = get_url('movie', movie_linkid) 228 | (status_code, html) = Spider.get_html_by_url(url) 229 | if status_code != 200: 230 | return status_code, None 231 | if html is None: 232 | return status_code, None 233 | # 解析页面内容 234 | try: 235 | data = Spider.movie_page_data(html) 236 | except Exception as e: 237 | Spider.log.error('movie_page_data error:%s', traceback.format_exc()) 238 | return status_code, None 239 | 240 | if empty(data) or empty(data['av_id']) or empty(data["title"]): 241 | Spider.log.error("movie crawl fatal,linkid:%s", movie_linkid) 242 | return 500, None 243 | data['linkid'] = movie_linkid 244 | # 输出当前进度 245 | Spider.log.info(data['av_id'].ljust(15) + data['release_date'] + ' ' + data['stars']) 246 | return status_code, data 247 | 248 | # 获取一个明星的信息 249 | @staticmethod 250 | def stars_one(linkid: str): 251 | stars_res = Spider.fetchall("SELECT * FROM av_stars WHERE linkid='{}'".format(linkid)) 252 | if len(stars_res) == 1: 253 | return stars_res[0] 254 | 255 | def get_val(str_param): 256 | return str_param.split(':')[1].strip() 257 | 258 | url = get_url('star', linkid) 259 | data = { 260 | 'linkid': linkid, 261 | 'name': '', 262 | 'name_history': '', 263 | 'birthday': '', 264 | 'height': '', 265 | 'cup': '', 266 | 'bust': '', 267 | 'waist': '', 268 | 'hips': '', 269 | 'hometown': '', 270 | 'hobby': '', 271 | 'headimg': '' 272 | } 273 | Spider.log.info("get:%s", url) 274 | (status_code, html) = Spider.get_html_by_url(url) 275 | if html is None: 276 | return False 277 | 278 | try: 279 | data['name'] = html.xpath( 280 | '/html/head/meta[8]/@content')[0].split(',', 1)[0] 281 | data['headimg'] = html.xpath( 282 | '//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')[0].split('/', 3)[3].replace( 283 | 
'mono/actjpgs/nowprinting.gif', '') 284 | except: 285 | return False 286 | 287 | for item_p in html.xpath('//*[@id="waterfall"]/div[1]/div/div[2]/p'): 288 | if empty(item_p.text): 289 | continue 290 | if list_in_str(('生日:', 'Birthday:', '生年月日:'), item_p.text): 291 | data['birthday'] = get_val(item_p.text) 292 | continue 293 | if list_in_str(('身高:', 'Height:', '身長:'), item_p.text): 294 | data['height'] = get_val(item_p.text) 295 | continue 296 | if list_in_str(('罩杯:', 'Cup:', 'ブラのサイズ:'), item_p.text): 297 | data['cup'] = get_val(item_p.text) 298 | continue 299 | if list_in_str(('胸围:', 'Bust:', 'バスト:'), item_p.text): 300 | data['bust'] = get_val(item_p.text) 301 | continue 302 | if list_in_str(('腰围:', 'Waist:', 'ウエスト:'), item_p.text): 303 | data['waist'] = get_val(item_p.text) 304 | continue 305 | if list_in_str(('臀围:', 'Hips:', 'ヒップ:'), item_p.text): 306 | data['hips'] = get_val(item_p.text) 307 | continue 308 | if list_in_str(('出生地:', 'Hometown:', '出身地:'), item_p.text): 309 | data['hometown'] = get_val(item_p.text) 310 | continue 311 | if list_in_str(('爱好:', 'Hobby:', '趣味:'), item_p.text): 312 | data['hobby'] = get_val(item_p.text) 313 | continue 314 | # 讲括号中的名字记录为曾用名 315 | tmp = data['name'].replace('(', '(').replace(')', '').split('(') 316 | if len(tmp) == 2: 317 | data['name_history'] = tmp[1] 318 | Spider.log.info("star:%r", data) 319 | Spider.stars_save(data) 320 | return data 321 | 322 | # 自动翻页返回movie_id 323 | @staticmethod 324 | def linkid_general(work_param: dict) -> Iterator[str]: 325 | # 网站限制最多100页 326 | for page_no in range(work_param["page_start"], work_param["page_limit"] + 1): 327 | time.sleep(CONFIG.getfloat("spider", "sleep")) 328 | 329 | url = get_url(work_param["page_type"], work_param["keyword"], page_no) 330 | Spider.log.info("get:{}".format(url)) 331 | 332 | (status_code, html) = Spider.get_html_by_url(url) 333 | if status_code in [304, 403, 404, 500] or html is None: 334 | break 335 | 336 | movie_id_list = 
html.xpath('//*[@id="waterfall"]/div/a/@href') 337 | if not movie_id_list: 338 | Spider.log.warning("page empty break") 339 | break 340 | for item in movie_id_list: 341 | if re.search("movie/[a-z0-9]{16}$", item): 342 | yield item[-16:] 343 | 344 | # 检查是否有下一页 345 | next_page = html.xpath( 346 | '//span[@class="glyphicon glyphicon-chevron-right"]') 347 | if not next_page: 348 | break 349 | 350 | @staticmethod 351 | def stars_save(data: dict) -> None: 352 | insert_sql = replace_sql_build(AV_STARS, data) 353 | Spider.db().execute(insert_sql, tuple(data.values())) 354 | Spider.db().commit() 355 | 356 | # 插入数据库 357 | def movie_save(self, insert_list: list) -> None: 358 | if empty(insert_list): 359 | return 360 | self.last_insert_list.extend(insert_list) 361 | 362 | insert_sql = replace_sql_build(AV_LIST, insert_list[0]) 363 | cur = Spider.db().cursor() 364 | cur.executemany(insert_sql, [tuple(x.values()) for x in insert_list]) 365 | Spider.db().commit() 366 | Spider.log.info('INSERT:%d', len(insert_list)) 367 | 368 | # 解析html数据 369 | @staticmethod 370 | def movie_page_data(html) -> dict: 371 | data = { 372 | 'linkid': '', 373 | # 番号 374 | 'av_id': html.xpath('/html/body/div[2]/div[1]/div[2]/p[1]/span[2]/text()')[0].strip().upper(), 375 | 'director': '', 376 | 'director_url': '', 377 | 'studio': '', 378 | 'studio_url': '', 379 | 'label': '', 380 | 'label_url': '', 381 | 'series': '', 382 | 'series_url': '', 383 | 'genre': '', 384 | 'stars': '', 385 | 'stars_url': '', 386 | # 图片个数image_len 387 | 'image_len': int(len(html.xpath('//div[@id="sample-waterfall"]/a'))), 388 | 'len': 0, 389 | # 标题 390 | 'title': html.xpath('/html/body/div[2]/h3/text()')[0].strip(), 391 | # 封面 截取域名之后的部分 392 | 'bigimage': '/' + html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')[0].split('/', 5)[5].strip(), 393 | # 发行时间 394 | 'release_date': html.xpath('/html/body/div[2]/div[1]/div[2]/p[2]/text()')[0].strip() 395 | } 396 | # 时长len 397 | len_text = 
html.xpath('/html/body/div[2]/div[1]/div[2]/p[3]/text()') 398 | if non_empty(len_text): 399 | res = re.findall("(\\d+)", len_text[0]) 400 | if non_empty(res): 401 | data['len'] = int(res[0].strip()) 402 | 403 | # 获取:导演、制作商、发行商、系列 404 | right_info = html.xpath('/html/body/div[2]/div[1]/div[2]/p/a') 405 | for i in right_info: 406 | if empty(i.text): 407 | continue 408 | tmp_href = i.attrib.get('href') 409 | 410 | if "/director/" in tmp_href: 411 | # 导演 412 | data['director'] = i.text.strip() 413 | data['director_url'] = tmp_href[-16:] 414 | elif "/studio/" in tmp_href: 415 | # 制作商 416 | data['studio'] = i.text.strip() 417 | data['studio_url'] = tmp_href[-16:] 418 | elif "/label/" in tmp_href: 419 | # 发行商 420 | data['label'] = i.text.strip() 421 | data['label_url'] = tmp_href[-16:] 422 | elif "/series/" in tmp_href: 423 | # 系列 424 | data['series'] = i.text.strip() 425 | data['series_url'] = tmp_href[-16:] 426 | 427 | genre_list = [] 428 | # 获取类别列表genre 类别列表genre_url 429 | for genre_tag in html.xpath('/html/body/div[2]/div[1]/div[2]/p/span/a'): 430 | if genre_tag.text is None: 431 | continue 432 | # 获取类目链接 433 | link = genre_tag.attrib.get('href') 434 | # 获取类目名 435 | name = genre_tag.text.strip() 436 | genre_list.append(name) 437 | 438 | # 查看类目是否存在,不存在则添加 439 | storage_ret = storage(AV_GENRE, {"linkid": link[-16:]}, "name") 440 | if empty(storage_ret): 441 | # 添加新类目 442 | genre_data = { 443 | 'linkid': link[-16:], 444 | 'name': name, 445 | 'title': '未知分类' 446 | } 447 | Spider.log.info('find new genre:%r', genre_data) 448 | sql = replace_sql_build(AV_GENRE, genre_data) 449 | Spider.db().execute(sql, tuple(genre_data.values())) 450 | Spider.db().commit() 451 | DATA_STORAGE[AV_GENRE].clear() 452 | 453 | data['genre'] = '|'.join(genre_list) 454 | if non_empty(data['genre']): 455 | data['genre'] = '|' + data['genre'] + '|' 456 | 457 | # 演员stars 458 | star_list = html.xpath('//div[@id="avatar-waterfall"]/a/span/text()') 459 | data['stars'] = '|'.join([x.strip() for x in 
star_list]) 460 | if non_empty(data['stars']): 461 | data['stars'] = '|' + data['stars'] + '|' 462 | 463 | # stars_url 464 | stars_url_list = html.xpath('//div[@id="avatar-waterfall"]/a/@href') 465 | if non_empty(stars_url_list): 466 | data['stars_url'] = '|' + '|'.join([re.findall('([a-z0-9]+)$', x)[0] 467 | for x in stars_url_list]) 468 | 469 | return data 470 | 471 | # 查询已存在影片 472 | @staticmethod 473 | def get_exist_linkid(page_type: str, keyword: str) -> dict: 474 | sql = '' 475 | exist_linkid_dict = {} 476 | # 必须有值 477 | if not keyword: 478 | return {} 479 | # 查询已存在的 480 | if page_type in ['director', 'studio', 'label', 'series']: 481 | sql = "SELECT linkid FROM av_list WHERE {}_url='{}'".format(page_type, keyword) 482 | if page_type == 'genre': 483 | genre = Spider.fetchall("SELECT name FROM av_genre WHERE linkid='{}'".format(keyword)) 484 | if genre: 485 | sql = "SELECT linkid FROM av_list WHERE genre LIKE '%|{}|%'".format(genre[0]['name']) 486 | if page_type == 'star': 487 | sql = "SELECT linkid FROM av_list WHERE stars_url LIKE '%{}%'".format(keyword) 488 | if page_type == 'group': 489 | sql = "SELECT linkid FROM av_list WHERE av_id LIKE '{}-%'".format(keyword) 490 | if page_type == 'search': 491 | where = [] 492 | for key_item in keyword.split(' '): 493 | where.append(search_where(key_item)) 494 | sql = "SELECT linkid FROM av_list WHERE " + " AND ".join(where) 495 | if non_empty(sql): 496 | ret = Spider.fetchall(sql) 497 | exist_linkid_dict = {x["linkid"]: True for x in ret} 498 | return exist_linkid_dict 499 | 500 | @staticmethod 501 | def get_html_by_url(url: str) -> tuple: 502 | retry_limit = 100 503 | for i in range(retry_limit): 504 | try: 505 | res = Spider.requests().get(url, timeout=CONFIG.getint("requests", "timeout")) 506 | if res.status_code != 200: 507 | Spider.log.error("status_code = {},url:{}".format(res.status_code, url)) 508 | return res.status_code, None 509 | 510 | return 200, etree.HTML(res.text) 511 | except Timeout as e: 512 | 
Spider.log.warning("requests Timeout,error:{}\nretry url:{}".format( 513 | e, url 514 | )) 515 | # 休眠 516 | time.sleep(10) 517 | # 超时重试 518 | continue 519 | 520 | except ConnectionError as e: 521 | Spider.log.warning("requests ConnectionError,error:{}\nretry url:{}".format( 522 | e, url 523 | )) 524 | # 休眠 525 | time.sleep(10) 526 | # 链接异常 527 | continue 528 | 529 | except Exception as e: 530 | Spider.log.warning("requests Exception:{}\nurl:{}".format(e, url)) 531 | time.sleep(10) 532 | continue 533 | # 返回错误 534 | return 500, None 535 | 536 | 537 | if __name__ == '__main__': 538 | pass 539 | -------------------------------------------------------------------------------- /static/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moozik/avmoo-spider/424d5f95f4f8dc3b766231235a49cd6e8cc49d4b/static/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /static/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moozik/avmoo-spider/424d5f95f4f8dc3b766231235a49cd6e8cc49d4b/static/icon.png -------------------------------------------------------------------------------- /templates/actresses.html: -------------------------------------------------------------------------------- 1 | {% extends "main.html" %} 2 | 3 | 4 | {% block head %} 5 | 10 | {% endblock %} 11 | 12 | {% block container %} 13 |
14 |
15 |
16 |
17 |
18 | 19 |
20 |
21 |
22 |
23 |
24 | {% for item in data.av_stars %} 25 |
26 | 27 |
28 | {% if item.headimg %} 29 | 30 | {% else %} 31 | 32 | {% endif %} 33 |
34 |
35 | {{item.name | rename}} 36 | {% if item.movie_count %} 37 | 作品 {{item.movie_count}} 38 | 最新 {{item.release_date}} 39 | {% endif %} 40 | 41 |
42 | {% endfor %} 43 |
44 |
45 |
46 | {% endblock %} -------------------------------------------------------------------------------- /templates/analyse.html: -------------------------------------------------------------------------------- 1 | {% extends "main.html" %} 2 | 3 | {% block container %} 4 |
5 |
6 |
7 | 8 |

{{data.page_type}}: {{data.analyse_name | rename}}

9 | 10 |

总时长

11 | {{data.minute_sum}} min 12 | 13 |

类别

14 | {% for item in data.genre_counter %} 15 | {{item.name | rename}}({{item.count}}) 16 | {% endfor %} 17 | 18 |

演员

19 | {% for item in data.stars_counter %} 20 | {{item.name | rename}}({{item.count}}) 21 | {% endfor %} 22 | 23 |

系列

24 | {% for item in data.series_counter %} 25 | {{item.name | rename}}({{item.count}}) 26 | {% endfor %} 27 | 28 |

制作商

29 | {% for item in data.studio_counter %} 30 | {{item.name | rename}}({{item.count}}) 31 | {% endfor %} 32 | 33 |

发行商

34 | {% for item in data.label_counter %} 35 | {{item.name | rename}}({{item.count}}) 36 | {% endfor %} 37 | 38 |

导演

39 | {% for item in data.director_counter %} 40 | {{item.name | rename}}({{item.count}}) 41 | {% endfor %} 42 |
43 |
44 |
45 | {% endblock %} -------------------------------------------------------------------------------- /templates/config.html: -------------------------------------------------------------------------------- 1 | {% extends "main.html" %} 2 | 3 | {% block container %} 4 |
5 |
6 |
7 |
8 |
9 |
base 基础配置
10 |
11 |
12 | 13 | 14 |
15 |
16 | 17 | 18 |
19 |
20 | 21 |
22 | 25 | 28 |
29 |
30 |
31 |
32 | 33 |
34 |
spider 爬虫配置
35 |
36 |
37 | 38 | 39 |
40 |
41 | 42 | 43 |
44 |
45 | 46 | 47 |
48 |
49 | 50 | 51 |
52 | 53 |
54 | 55 | 56 |
57 |
58 | 59 | 60 |
61 |
62 |
63 |
64 |
65 |
66 |
website 网站配置
67 |
68 |
69 | 70 |
71 | 76 |
77 |
78 |
79 | 80 | 81 |
82 |
83 | 84 | 85 |
86 |
87 | 88 | 89 |
90 |
91 | 92 | 93 |
94 | 95 |
96 | 97 |
98 | 101 | 104 |
105 |
106 |
107 | 108 |
109 | 112 | 115 |
116 |
117 |
118 | 119 |
120 | 123 | 126 |
127 |
128 |
129 | 130 |
131 | 134 | 137 |
138 |
139 |
140 | 141 |
142 | 145 | 148 |
149 |
150 |
151 |
152 | 153 |
154 | 155 |
156 |
157 |
158 | 159 |
160 |
161 |
扩展信息操作
162 |
163 |
164 | 165 | 170 |
171 |
172 | 173 | 174 |
175 |
176 | 177 | 178 |
179 |
180 | 185 | 190 | 195 |
196 |
197 |
198 |
199 |
200 | 201 | 202 |
203 | {% endblock %} -------------------------------------------------------------------------------- /templates/genre.html: -------------------------------------------------------------------------------- 1 | {% extends "main.html" %} 2 | {% block container %} 3 |
4 | 5 | {% for item in data.av_genre %} 6 |

{{item.0.title}}

7 |
8 | {% for item2 in item %} 9 | {{item2.name | rename}} 10 | {% if item2.genre_count %}({{item2.genre_count}}){% endif %} 11 | 12 | {% endfor %} 13 |
14 | {% endfor %} 15 |
16 | {% endblock %} 17 | {% block pagination %}{% endblock %} -------------------------------------------------------------------------------- /templates/group.html: -------------------------------------------------------------------------------- 1 | {% extends "main.html" %} 2 | 3 | {% block container %} 4 | 6 |
7 |
8 | 26 |
27 |
28 | {% endblock %} -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends "main.html" %} 2 | 3 | {% block container %} 4 |
5 |
6 |
7 | {#女优信息#} 8 | {% if data.av_stars and frame_data.page.now == 1 %} 9 |
10 |
11 |
12 | {% if data.av_stars.headimg %} 13 | 14 | {% else %} 15 | 16 | {% endif %} 17 |
18 |
19 | {{ data.av_stars.name | rename }} 20 | {% if data.av_stars.birthday %}

生日: {{ data.av_stars.birthday }}

{% endif %} 21 | {% if data.av_stars.age %}

年龄: {{ data.av_stars.age }}岁

{% endif %} 22 | {% if data.av_stars.height %}

身高: {{ data.av_stars.height }}

{% endif %} 23 | {% if data.av_stars.cup %}

罩杯: {{ data.av_stars.cup }}

{% endif %} 24 | {% if data.av_stars.bust %}

胸围: {{ data.av_stars.bust }}

{% endif %} 25 | {% if data.av_stars.waist %}

腰围: {{ data.av_stars.waist }}

{% endif %} 26 | {% if data.av_stars.hips %}

臀围: {{ data.av_stars.hips }}

{% endif %} 27 | {% if data.av_stars.hometown %}

出生地: {{ data.av_stars.hometown }}

{% endif %} 28 | {% if data.av_stars.hobby %}

爱好: {{ data.av_stars.hobby }}

{% endif %} 29 |
30 |
31 |
32 | {% endif %} 33 | 34 | {% if data.page_type != 'index' and data.page_type != 'like' and frame_data.page.now == 1 %} 35 |
36 |
37 |
38 | {% if data.page_type != 'star' %} 39 |

{{ page_type_map[data.page_type].name }}:

40 |

{{ frame_data.placeholder | rename }}

41 | {% endif %} 42 |

43 | 47 |

48 |

分析影片

50 | {# page_type_map中定义的才展示收藏 #} 51 | {% if page_type_map[data.page_type].like_enable %} 52 |

53 | {% if data.is_like %} 54 | 58 | {% else %} 59 | 63 | {% endif %} 64 |

65 | {% endif %} 66 | {% if page_type_map[data.page_type].rename_enable %} 67 |

68 | 72 |

73 | {% endif %} 74 |
75 |
76 |
77 |
78 |
79 | {% endif %} 80 | 81 | {% for item in data.av_list %} 82 | 113 | {% endfor %} 114 |
115 |
116 |
117 | 118 | 142 | {% endblock %} 143 | -------------------------------------------------------------------------------- /templates/install.html: -------------------------------------------------------------------------------- 1 | {% extends "main.html" %} 2 | 3 | {% block container %} 4 |
5 |
6 |
7 |
8 |
9 | 10 | 11 |
12 |
13 | 16 |
17 |
18 | 21 |
22 |
23 | 26 |
27 |
28 | 29 |
30 |
31 |
32 |
33 |
34 | {% endblock %} -------------------------------------------------------------------------------- /templates/main.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {% if frame_data.title %}{{frame_data.title}} {% endif %}avmoo-spider 8 | 9 | 10 | 11 | 12 | 13 | 14 | 54 | {% block head %} {% endblock %} 55 | 56 | 57 | 58 | 118 | 119 | {% block container %} 120 | {% endblock %} 121 | 122 | 123 | {% block pagination %} 124 | {% if frame_data.page %} 125 | 164 | {% endif %} 165 | {% endblock %} 166 | 167 | 188 | 192 | 193 | 194 | 260 | {% block tail %} {% endblock %} 261 | 262 | -------------------------------------------------------------------------------- /templates/movie.html: -------------------------------------------------------------------------------- 1 | {% extends "main.html" %} 2 | 3 | {% block container %} 4 |
5 |

{{data.title}}

6 |
7 |
8 | 9 | 10 | 11 |
12 |
13 |

14 | 识别码: 15 | {{data.av_id}} 16 |

17 |

18 | 番号: 19 | {{data.av_group}} 20 |

21 | {% if data.release_date %} 22 |

23 | 发行时间: {{data.release_date}} 24 |

25 | {% endif %} 26 | {% if data.len %} 27 |

28 | 长度: {{data.len}}分钟 29 |

30 | {% endif %} 31 | {% if data.director_url %} 32 |

33 | 导演: 34 | {{data.director | rename}} 35 |

36 | {% endif %} 37 | {% if data.studio_url %} 38 |

制作商:

39 |

40 | {{data.studio | rename}} 41 |

42 | {% endif %} 43 | {% if data.label_url %} 44 |

发行商:

45 |

46 | {{data.label | rename}} 47 |

48 | {% endif %} 49 | {% if data.series_url %} 50 |

系列:

51 |

52 | {{data.series | rename}} 53 |

54 | {% endif %} 55 | {% if data.genre_data %} 56 |

类别:

57 |

58 | {% for item in data.genre_data %} 59 | 60 | {{item.name | rename}} 61 | 62 | {% endfor %} 63 |

64 | {% endif %} 65 |

操作:

66 |

67 | {% if data.is_like %} 68 | 69 | {% else %} 70 | 71 | {% endif %} 72 | 73 |

74 |
75 |
76 | 77 | 78 | 79 | {% if config['website']['search_url'] %} 80 |

搜索资源

81 |
82 | {% for item in search_url(data.av_id) %} 83 | {{item.name}} 84 | {% endfor %} 85 |
86 | {% endif %} 87 |

影片资源

88 |
89 |
90 | 91 | 92 |
93 | 106 |
107 | 108 | {% if data.stars_data %} 109 |

演员

110 |
111 | {% for item in data.stars_data %} 112 | 113 |
114 | {% if item.headimg %} 115 | 116 | {% else %} 117 | 118 | {% endif %} 119 |
120 | {{item.name | rename}} 121 |
122 | {% endfor %} 123 |
124 | {% endif %} 125 | 126 | 127 | {% if data.stars_map %} 128 |

其他演员

129 |

130 | {% for item in data.stars_map %} 131 | 132 | {{item.name}} 133 | 134 | {% endfor %} 135 |

136 | {% endif %} 137 | 138 | 139 |
140 | {% if data.image_len > 0 %} 141 |

样品图像

142 |
143 | {% for item in detail_image(data.bigimage, data.image_len) %} 144 | 145 |
146 | 147 |
148 |
149 | {% endfor %} 150 |
151 | {% endif %} 152 |
153 | 154 |
155 | {% endblock %} 156 | {% block pagination %}{% endblock %} 157 | {% block tail %} 158 | {% if data.res_list %} 159 | 160 | 161 | {% endif %} 162 | 189 | {% endblock %} -------------------------------------------------------------------------------- /templates/scandisk.html: -------------------------------------------------------------------------------- 1 | {% extends "main.html" %} 2 | 3 | {% block head %} 4 | 9 | {% endblock %} 10 | 11 | {% block container %} 12 |
13 |
14 |
15 |
16 |
17 |

18 | 扫描本地磁盘中目标文件,自动识别番号文件名,可直接搜索,或打开。 19 |

20 |

21 | 建议选较为精准的目录。 22 |

23 | 24 | 32 | 33 | 34 | 35 |
36 |
37 | 38 | 39 | 40 | 41 | 42 | {% if file_target == "mp4" %} 43 | 44 | {% endif %} 45 | 46 | {% if file_target == "mp4" %} 47 | {% for resource in file_res %} 48 | {% if resource.av_id and av_data_map[resource.av_id] %} 49 | 50 | 51 | 52 | 53 | 80 | 81 | {% endif %} 82 | {% endfor %} 83 | {% for resource in file_res %} 84 | {% if resource.av_id and not av_data_map[resource.av_id] %} 85 | 86 | 87 | 88 | 89 | 90 | 91 | {% endif %} 92 | {% endfor %} 93 | {% for resource in file_res %} 94 | {% if not resource.av_id %} 95 | 96 | 97 | 98 | 99 | 100 | 101 | {% endif %} 102 | {% endfor %} 103 | 104 | {% else %} 105 | {% for resource in file_res %} 106 | 107 | 108 | 109 | 110 | 111 | {% endfor %} 112 | {% endif %} 113 | 114 |
路径打开信息
{{resource.file_path}} 54 | {{resource.av_id}} 55 | {% if resource.info.has_fetch_movie %} 56 | [影片已抓取] 57 | {% else %} 58 | [影片未抓取] 59 | {% endif %} 60 | {% if resource.info.has_res_extend %} 61 | [已存储路径] 62 | {% else %} 63 | [未存储路径] 64 | 65 | {% endif %} 66 | {# #} 67 | {#
#} 68 | {# #} 71 | {#
#} 72 | {#
#} 73 | {# #} 74 | {# {{ av_data_map[resource.av_id].av_id }} /#} 75 | {# {{ av_data_map[resource.av_id].release_date }}#} 76 | {# #} 77 | {#
#} 78 | {#
#} 79 |
{{resource.file_path}}{{resource.av_id}} [影片未抓取]
{{resource.file_path}}
{{resource.file_path}}
115 | 116 | 117 |
118 |
119 |
120 | {% endblock %} 121 | 122 | 123 | {% block tail %} 124 | 130 | {% endblock %} -------------------------------------------------------------------------------- /templates/spider.html: -------------------------------------------------------------------------------- 1 | {% extends "main.html" %} 2 | 3 | {% block container %} 4 | 9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 | 30 |
31 |
32 | 33 | 34 |
35 |
36 | 37 |
38 | 41 | 44 |
45 |
46 | 47 |
48 |
49 |
50 |

运行中任务

51 |
52 |
    53 |
54 |
55 |

等待中任务

56 |
57 |
    58 |
59 |
60 |

已完成任务

61 |
62 |
    63 |
64 |
65 |
66 |
67 |
68 | {% endblock %} 69 | 70 | {% block tail %} 71 | 138 | {% endblock %} -------------------------------------------------------------------------------- /website.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | import collections 4 | import datetime 5 | import json 6 | import logging 7 | import math 8 | import re 9 | import time 10 | from logging.config import dictConfig 11 | from logging.handlers import RotatingFileHandler 12 | 13 | from flask import Flask 14 | from flask import redirect 15 | from flask import render_template 16 | from flask import request 17 | from flask import url_for 18 | 19 | from common import * 20 | from spider import Spider 21 | 22 | app = Flask(APP_NAME) 23 | app.jinja_env.auto_reload = True 24 | app.jinja_env.filters['rename'] = rename 25 | app.jinja_env.filters['url_rename'] = url_rename 26 | app.jinja_env.filters['small_img'] = small_img 27 | app.jinja_env.filters['big_img'] = big_img 28 | app.jinja_env.filters['can_play_url'] = can_play_url 29 | app.config['TEMPLATES_AUTO_RELOAD'] = True 30 | # 向网页暴露异常 31 | app.config['PROPAGATE_EXCEPTIONS'] = True 32 | SPIDER = Spider() 33 | 34 | 35 | def run(): 36 | LOGGER.info("website.run") 37 | if CONFIG.getboolean("base", "debug_mode"): 38 | app.debug = True 39 | # 打开主页 40 | if CONFIG.getboolean("website", "auto_open_site_on_run"): 41 | open_browser_tab(get_local_url()) 42 | app.run(port=CONFIG.getint("base", "port"), processes=1) 43 | 44 | 45 | # 安装程序 46 | @app.route('/install', methods=['GET', 'POST']) 47 | def install(): 48 | if request.method == 'GET': 49 | return render_template('install.html', 50 | db_file=CONFIG.get("base", "db_file"), 51 | frame_data={ 52 | 'title': '安装程序' 53 | }) 54 | # 创建db 55 | db_file = CONFIG.get("base", "db_file") 56 | LOGGER.info('create db,', db_file) 57 | db = sqlite3.connect(db_file, check_same_thread=False) 58 | # 创建表 59 | sql_list = [ 60 | CREATE_AV_LIST_SQL, 61 | 
CREATE_AV_STARS_SQL, 62 | CREATE_AV_GENRE_SQL, 63 | CREATE_AV_EXTEND_SQL 64 | ] 65 | for sql in sql_list: 66 | LOGGER.info('create table,sql:', sql) 67 | db.cursor().execute(sql) 68 | db.close() 69 | 70 | db_init() 71 | # 抓取av_genre 72 | insert(AV_GENRE, Spider.crawl_genre()) 73 | if 'init.crawl' in request.form: 74 | # 七ツ森りり 75 | crawl_accurate('star', '17f01576bb6b6755') 76 | for item in AV_GENRE_DEMO_DATA: 77 | insert(AV_EXTEND, [{'extend_name': item[0], 'key': item[1], 'val': item[2]}]) 78 | # 跳转到爬虫页 79 | return redirect(url_for('page_spider')) 80 | 81 | 82 | # IO错误时跳转到安装页面 83 | @app.errorhandler(IOError) 84 | def handle_exception(e): 85 | return redirect(url_for('install')) 86 | 87 | 88 | # 获取详情图列表 89 | # 大图(ssis00263pl.jpg) 90 | # 小图(ssis00263ps.jpg) 91 | # 详情小图(ssis00263-1.jpg)-(ssis00263-10.jpg) 92 | # 详情大图(ssis00263jp-1.jpg)-(ssis00263jp-10.jpg) 93 | def detail_image(big_image, img_count): 94 | ret = [] 95 | big_image = big_image[:-6] 96 | for k in range(img_count): 97 | ret.append({ 98 | 'small': '{}/digital/video{}-{}.jpg'.format(CONFIG.get('website', 'cdn'), big_image, k+1), 99 | 'big': '{}/digital/video{}jp-{}.jpg'.format(CONFIG.get('website', 'cdn'), big_image, k+1) 100 | }) 101 | return ret 102 | 103 | 104 | # 构造search链接 105 | def search_url(av_id): 106 | search_url_str = CONFIG.get('website', 'search_url') 107 | search_url_list = search_url_str.strip().split(',') 108 | if not search_url_list: 109 | return [] 110 | ret = [] 111 | for url in search_url_list: 112 | res = re.findall("^(https?://)?([^/]+)", url) 113 | if not res: 114 | continue 115 | ret.append({ 116 | 'url': url + av_id, 117 | 'name': res[0][1] 118 | }) 119 | return ret 120 | 121 | 122 | # 全局变量 123 | @app.context_processor 124 | def app_context_processor(): 125 | return { 126 | 'config': CONFIG, 127 | 'page_type_map': PAGE_TYPE_MAP, 128 | 'country_map': COUNTRY_MAP, 129 | 'detail_image': detail_image, 130 | 'search_url': search_url 131 | } 132 | 133 | 134 | # http请求log 135 | 
@app.before_request
def app_before_request():
    # log every request except static assets
    if request.path[:8] != '/static/':
        LOGGER.info('"{} {}"'.format(request.method, color(32, request.path)))


# home page / search page
# NOTE(review): all URL converters (<keyword>, <int:page_num>, <linkid>, ...)
# in this file were stripped by the text extraction; they are restored below
# from the handler signatures — confirm against version control.
@app.route('/')
@app.route('/page/<int:page_num>')
@app.route('/search/<keyword>')
@app.route('/search/<keyword>/page/<int:page_num>')
def index(keyword='', page_num=1):
    """Render the movie list, optionally filtered by a search keyword."""
    where = search_where_build(keyword)

    (result, row_count) = select_av_list(av_list_where=where, page_num=page_num)
    if non_empty(keyword):
        page_root = '/search/{}'.format(quote(keyword))
    else:
        page_root = ''
    return render_template('index.html',
                           data={
                               AV_LIST: result,
                               'page_type': 'index'
                           },
                           frame_data={
                               'title': keyword,
                               'placeholder': keyword,
                               'origin_link': get_url("search", quote(keyword), page_num),
                               'page': pagination(page_num, row_count, page_root)
                           })


# actress list page
@app.route('/actresses')
@app.route('/actresses/page/<int:page_num>')
def page_actresses(page_num=1):
    """List actresses ordered by their most recent release date."""
    page_limit = CONFIG.getint("website", "actresses_page_limit")
    sql_text = '''
    SELECT * FROM (
    SELECT av_stars.*,COUNT(*) AS movie_count,av_list.release_date
    FROM (
        SELECT max(release_date) as release_date,stars_url
        FROM av_list
        WHERE stars_url != ''
        GROUP BY stars_url
        ORDER BY release_date DESC
    )av_list
    JOIN av_stars ON INSTR(av_list.stars_url, av_stars.linkid) > 0
    GROUP BY av_stars.linkid)
    ORDER BY release_date DESC
    '''
    # efficiency mode: skip the expensive join
    if CONFIG.getboolean("website", "efficiency_mode"):
        sql_text = "SELECT * FROM av_stars"

    sql_result = "{} LIMIT {},{}".format(sql_text, (page_num - 1) * page_limit, page_limit)
    result = query_sql(sql_result)
    count = query_sql('SELECT COUNT(*) AS co FROM av_stars')[0]['co']
    return render_template('actresses.html',
                           data={
                               AV_STARS: result
                           },
                           frame_data={
                               'title': '女优',
                               'origin_link': get_url("actresses"),
                               'page': pagination(page_num, count, "/actresses", page_limit)
                           })


# group / series / studio / label list page
@app.route('/series')
@app.route('/series/page/<int:page_num>')
@app.route('/studio')
@app.route('/studio/page/<int:page_num>')
@app.route('/label')
@app.route('/label/page/<int:page_num>')
@app.route('/group')
@app.route('/group/page/<int:page_num>')
def page_group(page_num=1):
    """Grouped list page; the group key is taken from the first path segment."""
    page_type = request.path.split('/')[1]
    result = group_data(page_type, page_num)

    if page_type == "group":
        count_res = query_sql(
            "SELECT count(1) AS co FROM (SELECT DISTINCT substr(av_id, 0, instr(av_id, '-')) FROM av_list)")
    else:
        count_res = query_sql(
            "SELECT count(DISTINCT {0}) AS co FROM av_list WHERE {0} != ''".format(page_type + '_url'))
    return render_template('group.html',
                           data={
                               "list": result,
                               "page_type": page_type,
                           },
                           frame_data={
                               'title': PAGE_TYPE_MAP[page_type]['name'],
                               'page': pagination(page_num, count_res[0]['co'], '/' + page_type,
                                                  CONFIG.getint('website', 'group_page_limit'))
                           })


# build the data for a group page
def group_data(page_type: str, page_num: int, where: str = '1'):
    """Return one row per group (latest cover, movie count) for page_type,
    ordered by the configured column ('release_date' or 'count')."""
    page_limit = CONFIG.getint('website', 'group_page_limit')
    order_by = CONFIG.get('website', 'group_page_order_by')
    if order_by not in ['release_date', 'count']:
        order_by = 'count'

    if page_type == "group":
        # group key = av_id prefix before '-'
        sql_text = '''
        SELECT linkid,linkid AS title,release_date,bigimage,av_id,count(1) AS count FROM(
            SELECT
            substr(av_id, 0, instr(av_id, '-')) AS linkid,
            release_date,bigimage,av_id
            FROM av_list
            WHERE {3}
            ORDER BY release_date DESC,av_id DESC
        )
        GROUP BY linkid
        ORDER BY {0} DESC
        LIMIT {1},{2}
        '''.format(
            order_by,
            (page_num - 1) * page_limit,
            page_limit,
            where,
        )
    else:
        sql_text = '''
        SELECT *,count(*) AS count FROM(SELECT {1} AS linkid,{0} AS title,release_date,bigimage,av_id FROM av_list
        WHERE {1} != '' AND {5}
        ORDER BY release_date DESC)
        GROUP BY linkid
        ORDER BY {2} DESC
        LIMIT {3},{4}
        '''.format(
            page_type,
            page_type + '_url',
            order_by,
            (page_num - 1) * page_limit,
            page_limit,
            where
        )
    return query_sql(sql_text)


# genre (tag) page
@app.route('/genre')
def genre():
    """Render all genres grouped by category, with per-genre movie counts."""
    av_genre_res = query_sql("SELECT linkid,name,title FROM av_genre")

    # crawl genres on first use
    if not av_genre_res:
        LOGGER.info('spider.genre.fetch')
        insert(AV_GENRE, Spider.crawl_genre())
        return "请刷新"

    # count tag occurrences across all movies
    genre_list = []
    for row in query_sql("SELECT genre AS genre FROM av_list"):
        genre_list.extend(list(set(row['genre'].strip("|").split("|"))))
    genre_counter = collections.Counter(genre_list)

    data = {}
    for item in av_genre_res:
        if item['title'] not in data:
            data[item['title']] = []
        # attach the count to each genre item
        if item['name'] in genre_counter:
            item["genre_count"] = genre_counter[item['name']]

        data[item["title"]].append(item)
    data = list(data.values())
    return render_template('genre.html',
                           data={
                               AV_GENRE: data
                           },
                           frame_data={
                               'title': PAGE_TYPE_MAP['genre']['name'],
                               'origin_link': get_url("genre"),
                               'page': {'count': len(av_genre_res)}
                           })


# category detail pages keyed by a standard linkid
@app.route('/director/<linkid>')
@app.route('/director/<linkid>/page/<int:page_num>')
@app.route('/studio/<linkid>')
@app.route('/studio/<linkid>/page/<int:page_num>')
@app.route('/label/<linkid>')
@app.route('/label/<linkid>/page/<int:page_num>')
@app.route('/series/<linkid>')
@app.route('/series/<linkid>/page/<int:page_num>')
def search_normal(linkid='', page_num=1):
    """Movie list filtered by director/studio/label/series linkid."""
    page_type = request.path.split('/')[1]
    # origin-site link
    origin_link = get_url(page_type, linkid, page_num)
    # pagination root
    page_root = '/{}/{}'.format(page_type, linkid)

    # filter
    where = PAGE_TYPE_MAP[page_type]['where'].format(linkid)
    (result, row_count) = select_av_list(av_list_where=[where], page_num=page_num)
    # NOTE(review): the <br> in this message was lost in extraction and is
    # reconstructed here — confirm against version control.
    if not result:
        return '没找到数据<br>{}'.format(a_tag_build(get_url(page_type, linkid)))

    # default search placeholder
    placeholder = result[0][page_type]

    # is the current page bookmarked
    is_like = False
    if PAGE_TYPE_MAP[page_type]["like_enable"] and storage(AV_EXTEND, {"extend_name": "like", "key": PAGE_TYPE_MAP[page_type]["key"], "val": linkid}):
        is_like = True

    return render_template('index.html',
                           data={
                               AV_LIST: result,
                               'page_type': page_type,
                               'linkid': linkid,
                               'is_like': is_like
                           },
                           frame_data={
                               'title': rename(placeholder),
                               'placeholder': placeholder,
                               'origin_link': origin_link,
                               'page': pagination(page_num, row_count, page_root)
                           })


# detail pages whose key is not a standard linkid (group/genre/star)
@app.route('/star/<keyword>')
@app.route('/star/<keyword>/page/<int:page_num>')
@app.route('/genre/<keyword>')
@app.route('/genre/<keyword>/page/<int:page_num>')
@app.route('/group/<keyword>')
@app.route('/group/<keyword>/page/<int:page_num>')
def search_other(keyword='', page_num=1):
    """Movie list filtered by star linkid, genre linkid/name, or av_id prefix."""
    page_type = request.path.split('/')[1]
    page_root = '/{}/{}'.format(page_type, keyword)

    placeholder = ''
    linkid = keyword
    origin_link = CONFIG.get("base", "avmoo_site")

    if page_type == 'genre':
        origin_link = get_url('genre', keyword)
        store_ret = storage(AV_GENRE, {'linkid': keyword})
        if non_empty(store_ret):
            keyword = store_ret[0]['name']
            placeholder = keyword

    if page_type == 'group':
        placeholder = keyword
        origin_link = get_url("search", keyword)

    if page_type == 'star':
        origin_link = get_url("star", keyword)

    where = PAGE_TYPE_MAP[page_type]['where'].format(keyword)

    (result, row_count) = select_av_list(av_list_where=[where], page_num=page_num)

    star_data = None
    if page_type == 'star':
        if non_empty(result) and non_empty(result[0]["stars"]):
            # resolve the star name from the parallel stars/stars_url lists
            placeholder = result[0]["stars"].split("|")[result[0]["stars_url"].split("|").index(keyword)]

        star_data = query_sql("SELECT * FROM av_stars WHERE linkid='{}'".format(keyword))
        if len(star_data) == 1:
            star_data = star_data[0]
            # compute age from birthday (debug print removed)
            if non_empty(star_data['birthday']) and re.match(r"\d{4}-\d{2}-\d{2}", star_data['birthday']):
                sp = star_data['birthday'].split('-')
                birthday_data = datetime.date(int(sp[0]), int(sp[1]), int(sp[2]))
                star_data['age'] = math.ceil(
                    (datetime.date.today() - birthday_data).days / 365)

    # is the current page bookmarked
    is_like = False
    if PAGE_TYPE_MAP[page_type]["like_enable"] and storage(AV_EXTEND, {"extend_name": "like", "key": PAGE_TYPE_MAP[page_type]["key"], "val": keyword}):
        is_like = True

    return render_template('index.html',
                           data={
                               AV_LIST: result,
                               AV_STARS: star_data,
                               'page_type': page_type,
                               'linkid': linkid,
                               'is_like': is_like
                           },
                           frame_data={
                               'title': rename(placeholder),
                               'placeholder': placeholder,
                               'origin_link': origin_link,
                               'page': pagination(page_num, row_count, page_root)
                           })


# movie detail page
@app.route('/movie/<linkid>')
def movie(linkid=''):
    where = PAGE_TYPE_MAP['movie']['where'].format(linkid)
    movie_list = query_sql("SELECT * FROM av_list WHERE {}".format(where))
    # NOTE(review): the <br> was lost in extraction; both format args are now used
    if not movie_list:
        return "没找到影片{}<br>{}".format(linkid, a_tag_build(get_url('movie', linkid)))
    movie_data = movie_build(movie_list[0])
    return render_template('movie.html',
                           data=movie_data,
                           frame_data={
                               'title': movie_data['title'],
                               'origin_link': get_url("movie", movie_list[0]['linkid'])
                           })


# bookmarks: movies
@app.route('/like/movie')
@app.route('/like/movie/page/<int:page_num>')
def page_like_movie(page_num=1):
    page_root = '/like/movie'

    where = "av_id IN (SELECT distinct val FROM av_extend WHERE extend_name='like' AND key='av_id')"
    (result, row_count) = select_av_list(av_list_where=[where], page_num=page_num)

    return render_template('index.html',
                           data={
                               AV_LIST: result,
                               'page_type': 'like'
                           },
                           frame_data={
                               'title': '收藏影片',
                               'page': pagination(page_num, row_count, page_root)
                           })


# bookmarks: group / series / label / studio
@app.route('/like/<page_type>')
@app.route('/like/<page_type>/page/<int:page_num>')
def page_like(page_type='', page_num=1):
    pmap = PAGE_TYPE_MAP[page_type]
    if not pmap['like_enable']:
        return "error"

    page_root = '/like/' + page_type
    where = ''
    # subquery selecting bookmarked keys
    extend_sql = "SELECT distinct val FROM av_extend WHERE extend_name='like' AND key='{}'".format(PAGE_TYPE_MAP[page_type]['key'])

    if page_type == 'group':
        where = "substr(av_id, 0, instr(av_id, '-')) IN ({})".format(extend_sql)

    if page_type in ['series', 'studio', 'label']:
        where = "{} IN ({})".format(PAGE_TYPE_MAP[page_type]['key'], extend_sql)

    count = len(storage(AV_EXTEND, {"extend_name": 'like', 'key': PAGE_TYPE_MAP[page_type]['key']}))
    return render_template('group.html',
                           data={
                               "list": group_data(page_type, page_num, where),
                               "page_type": page_type,
                           },
                           frame_data={
                               'title': '收藏' + PAGE_TYPE_MAP[page_type]['name'],
                               'page': pagination(page_num, count, page_root,
                                                  CONFIG.getint('website', 'group_page_limit'))
                           })


# spider page
@app.route('/spider')
def page_spider():
    return render_template('spider.html',
                           frame_data={
                               'title': '爬虫'
                           })


# config page
@app.route('/config', methods=['GET', 'POST'])
def page_config():
    """GET renders the config form; POST saves all submitted options."""
    if request.method == 'GET':
        return render_template('config.html',
                               frame_data={
                                   'title': '配置'
                               })
    # every form field name is "section.option"
    for name in request.form:
        (section, option) = name.split(".")
        if name == 'website.search_url':
            # textarea: one url per line -> comma-separated
            CONFIG.set(section=section, option=option,
                       value=','.join([x.strip() for x in request.form[name].split("\n")]))
            continue
        CONFIG.set(section=section, option=option, value=request.form[name].strip())
    config_save(CONFIG)
    config_init()
    LOGGER.info("new config:%r", json.dumps(request.form, ensure_ascii=False))
    return redirect(request.referrer)


# analyse page
@app.route('/analyse/<page_type>/<keyword>')
def action_analyse_star(page_type='', keyword=''):
    """Aggregate genre/star/series/studio/label/director counts and total
    runtime for every movie matching the page_type/keyword filter."""
    sql = "SELECT * FROM av_list WHERE {};".format(page_type_datail_where_build(page_type, keyword))
    data = fetchall(sql)
    # NOTE(review): the <br> in this message was lost in extraction
    if empty(data):
        return "没找到数据<br>{}".format(a_tag_build(get_url(page_type, keyword)))

    # 'group' uses the keyword itself as display name
    analyse_name = keyword
    if page_type == 'star':
        i = data[0]['stars_url'].strip('|').split('|').index(keyword)
        analyse_name = data[0]['stars'].strip('|').split('|')[i]

    if page_type == 'genre':
        storage_ret = storage(AV_GENRE, {'linkid': [keyword]}, 'name')
        if non_empty(storage_ret):
            analyse_name = storage_ret[0]

    if page_type in ['director', 'studio', 'label', 'series']:
        analyse_name = data[0][page_type]

    genre_all = []
    stars_all = []
    series_all = []
    studio_all = []
    label_all = []
    director_all = []
    minute_sum = 0
    for row in data:
        genre_all.extend(row["genre"].strip('|').split("|"))
        stars_all.extend(row["stars"].strip('|').split("|"))
        if row["series"]:
            series_all.append(row["series"])
        if row["studio"]:
            studio_all.append(row["studio"])
        if row["label"]:
            label_all.append(row["label"])
        if row["director"]:
            director_all.append(row["director"])

        minute_sum = minute_sum + int(row["len"])

    # sort each counter by frequency, descending
    genre_counter = collections.OrderedDict(
        sorted(collections.Counter(genre_all).items(), key=lambda x: x[1], reverse=True))
    stars_counter = collections.OrderedDict(
        sorted(collections.Counter(stars_all).items(), key=lambda x: x[1], reverse=True))
    series_counter = collections.OrderedDict(
        sorted(collections.Counter(series_all).items(), key=lambda x: x[1], reverse=True))
    studio_counter = collections.OrderedDict(
        sorted(collections.Counter(studio_all).items(), key=lambda x: x[1], reverse=True))
    label_counter = collections.OrderedDict(
        sorted(collections.Counter(label_all).items(), key=lambda x: x[1], reverse=True))
    director_counter = collections.OrderedDict(
        sorted(collections.Counter(director_all).items(), key=lambda x: x[1], reverse=True))

    genre_counter = [{'name': x, 'count': genre_counter[x]} for x in genre_counter]
    stars_counter = [{'name': x, 'count': stars_counter[x]} for x in stars_counter if x != '']
    series_counter = [{'name': x, 'count': series_counter[x]} for x in series_counter if x != '']
    studio_counter = [{'name': x, 'count': studio_counter[x]} for x in studio_counter if x != '']
    label_counter = [{'name': x, 'count': label_counter[x]} for x in label_counter if x != '']
    director_counter = [{'name': x, 'count': director_counter[x]} for x in director_counter if x != '']

    data = {
        "analyse_name": analyse_name,
        "page_type": page_type,
        "keyword": keyword,
        "minute_sum": minute_sum,
        "genre_counter": genre_counter,
        "stars_counter": stars_counter,
        "series_counter": series_counter,
        "studio_counter": studio_counter,
        "label_counter": label_counter,
        "director_counter": director_counter,
    }
    return render_template('analyse.html',
                           data=data,
                           frame_data={
                               'title': '[{}]分析结果'.format(rename(data['analyse_name']))
                           })


# build the WHERE clause list for a search keyword
def search_where_build(keyword: str) -> list:
    """Parse the search box input into a list of SQL conditions.

    Supports: raw 'WHERE ...' passthrough, 'type[value]' filters, bare
    linkids, the special words 已发布/有资源/已下载, and plain text terms.
    """
    where = []
    keyword = keyword.strip()

    if empty(keyword):
        return []

    # sql mode
    if keyword[:5] == 'WHERE':
        return [keyword[5:]]

    # structured filters like star[xxx] or genre[yyy]
    re_res = re.findall("((director|studio|label|series|genre|star|group)\\[(.+?)\\])", keyword)
    for item in re_res:
        where.append(page_type_datail_where_build(item[1], item[2]))
        keyword = keyword.replace(item[0], '')

    keyword = keyword.strip()
    # generic space-separated terms
    for key_item in keyword.strip().split(' '):
        if empty(key_item):
            continue

        # bare linkid
        if is_linkid(key_item):
            # BUGFIX: was {'linkid': [keyword]} — used the whole search string
            # instead of the current term, so genre linkids never matched
            genre_data = storage(AV_GENRE, {'linkid': [key_item]})
            if non_empty(genre_data):
                where.append("genre GLOB '*|{}|*'".format(genre_data[0]["name"]))
                continue

            sql = " OR ".join(["{}='{}'".format(item, key_item) for item in
                               ['linkid', 'director_url', 'studio_url', 'label_url', 'series_url']])
            where.append("({} OR stars_url GLOB '*|{}*')".format(sql, key_item))
            continue

        if key_item == '已发布':
            date = time.strftime("%Y-%m-%d", time.localtime())
            where.append("av_list.release_date <= '{}'".format(date))
            continue

        if key_item == '有资源':
            where.append(
                "av_id IN (SELECT distinct key FROM av_extend WHERE extend_name='movie_res')")
            continue

        if key_item == '已下载':
            where.append(
                "av_id IN (SELECT distinct key FROM av_extend WHERE extend_name='movie_res' AND val LIKE '_:\\%')")
            continue

        where.append(search_where(key_item))
    return where


# WHERE clause for a page_type detail page
def page_type_datail_where_build(page_type: str, keyword: str) -> str:
    """Map (page_type, keyword) to a single SQL condition on av_list.
    Returns None for unknown page types (callers pass known types only)."""
    keyword = keyword

    if page_type == 'group':
        return "av_id like '{}-%'".format(keyword)

    if page_type == 'genre':
        if is_linkid(keyword):
            keyword = storage(AV_GENRE, {'linkid': [keyword]}, "name")[0]
        return "genre like '%|{}|%'".format(sql_escape(keyword))

    if page_type == 'star':
        if is_linkid(keyword) or re.match("[a-z0-9]{4}", keyword):
            return "stars_url like '%|{}%'".format(keyword)
        else:
            return "stars like '%|{}|%'".format(sql_escape(keyword))

    if page_type in ['director', 'studio', 'label', 'series']:
        if is_linkid(keyword):
            return "{}_url = '{}'".format(page_type, keyword)
        else:
            return "{} = '{}'".format(page_type, sql_escape(keyword))


def page_type_group_where_build(page_type: str) -> str:
    # NOTE(review): returns None (implicitly) for the listed types and '1=1'
    # otherwise — looks inverted/unused; confirm before relying on it.
    if page_type not in ['movie', 'series', 'studio', 'label']:
        return '1=1'


# assemble the movie detail page data
def movie_build(movie_data):
    """Enrich one av_list row with genre links, star data, resources and
    bookmark state; also repairs legacy rows missing '|' delimiters."""
    # data repair: genre must start with '|'
    if movie_data["genre"] and isinstance(movie_data["genre"], str) and movie_data["genre"][0] != '|':
        execute("update av_list set genre=('|' || genre || '|') where genre != '' and genre not like '|%'")
    # data repair 20200212: stars must end with '|'
    if movie_data["stars"] and isinstance(movie_data["stars"], str) and movie_data["stars"][-1] != '|':
        execute("update av_list set stars=(stars || '|') where stars != '' and stars not like '%|'")
    # genres
    movie_data['genre_data'] = []
    if non_empty(movie_data['genre']):
        for item in movie_data['genre'].strip('|').split('|'):
            linkid = storage(AV_GENRE, {"name": item}, 'linkid')
            if non_empty(linkid):
                movie_data['genre_data'].append({
                    'linkid': linkid[0],
                    "name": item
                })

    # stars
    if non_empty(movie_data['stars_url']) and isinstance(movie_data['stars_url'], str):
        movie_data['stars_url'] = movie_data['stars_url'].strip('|').split("|")
        movie_data['stars'] = movie_data['stars'].strip('|').split("|")

        sql_text = "SELECT linkid,name,headimg FROM av_stars WHERE linkid IN ('{}')".format(
            "','".join(movie_data['stars_url'])
        )
        movie_data['stars_data'] = query_sql(sql_text)
        # stars not yet crawled into av_stars
        if len(movie_data['stars_data']) < len(movie_data['stars_url']):
            movie_data['stars_map'] = []
            linkid_list = [x['linkid'] for x in movie_data['stars_data']]
            for i in range(len(movie_data['stars_url'])):
                if movie_data['stars_url'][i] in linkid_list:
                    continue
                movie_data['stars_map'].append({
                    'linkid': movie_data['stars_url'][i],
                    "name": movie_data['stars'][i]
                })

    # movie resources
    storage_res = storage(AV_EXTEND, {"extend_name": "movie_res", "key": [movie_data['av_id']]}, "val")
    movie_data['res_list'] = storage_res

    movie_data['av_group'] = movie_data['av_id'].split('-', 1)[0]

    # bookmark state
    movie_data["is_like"] = False
    if storage(AV_EXTEND, {"extend_name": "like", "key": 'av_id', "val": movie_data['av_id']}):
        movie_data["is_like"] = True

    # mark as built
    movie_data['build'] = True
    return movie_data


# spider control
@app.route('/action/crawl/control/<action>')
def action_crawl_control(action):
    if action == "clean":
        QUEUE.queue.clear()
        return "已清空"

    if action == "exit":
        SPIDER.get_running_work("exit")
        return "已跳过当前任务"

    # BUGFIX: previously fell through returning None, which makes Flask 500
    return "unknown action"


# spider entry
# /spider form button
@app.route('/action/crawl', methods=['POST'])
def action_crawl():
    """Queue crawl jobs for every link / av_id submitted from the spider page."""
    url_text = request.form['url_text']
    input_num_limit = request.form['page_limit']
    skip_exist = True
    if request.form['skip_exist'] == "False":
        skip_exist = False
    link_list = [x.strip() for x in url_text.split("\n") if x.strip() != '']

    if empty(link_list):
        return '请输入有效id'
    page_limit = PAGE_MAX

    if input_num_limit.isnumeric() and int(input_num_limit) <= PAGE_MAX:
        page_limit = int(input_num_limit)

    for link in link_list:
        # not a url: treat as a search term / av_id
        if not re.match("https?://", link):
            # skip av_ids that already exist
            if re.match(r"[A-Za-z]+-\d+", link):
                res = fetchall('SELECT * FROM av_list WHERE av_id="{}"'.format(link))
                if non_empty(res):
                    LOGGER.info("av_id:{},exist".format(link))
                    continue
            # build a search url with the term encoded
            link = get_url("search", quote(link))

        page_type, keyword, page_start = parse_url(link)
        if empty(page_type):
            # BUGFIX: extra positional arg with no %s placeholder
            LOGGER.fatal("wrong link: %s", link)
            continue
        crawl_accurate(page_type, keyword, page_start, page_limit, skip_exist)
    return redirect(url_for("page_spider"))


# precise spider entry, page type already resolved
# /genre refresh-genre button
# /actresses refresh-all button
# /movie/linkid re-crawl button
# /search/keyword re-crawl button
# /(star|genre|series|studio|label|director|group)/linkid refresh button
@app.route('/action/crawl/accurate', methods=['POST'])
def action_crawl_accurate():
    return crawl_accurate(request.form['page_type'], request.form['keyword'])


# unified crawl-job entry
def crawl_accurate(page_type: str, keyword: str = '', page_start: int = 1, page_limit: int = PAGE_MAX,
                   skip_exist: bool = True):
    """Validate page_type and queue the corresponding crawl work item(s)."""
    if page_type not in ['movie', 'star', 'genre', 'series', 'studio', 'label', 'director', 'search', 'popular',
                         'group', 'all_star', 'all_genre']:
        return 'wrong'
    if page_type == 'all_genre':
        LOGGER.info('spider.genre.fetch')
        insert(AV_GENRE, Spider.crawl_genre())
        return '抓取完毕'
    if page_type == 'group':
        # a group crawl is a search for 'PREFIX-'
        page_type = 'search'
        keyword = keyword + '-'

    if page_type == 'all_star':
        star_list = query_sql("SELECT linkid,name FROM av_stars")
        for item in star_list:
            # queue every actress
            add_work({
                "page_type": "star",
                "keyword": item['linkid'],
                "skip_exist": True,
            })
        return '排队中({})...'.format(len(star_list))

    if page_type in ['movie', 'star', 'genre', 'series', 'studio', 'label', 'director']:
        if not is_linkid(keyword):
            return 'keyword错误'
        add_work({
            "page_type": page_type,
            "keyword": keyword,
            "page_start": page_start,
            "page_limit": page_limit,
            "skip_exist": skip_exist,
        })
        return '排队中...'

    # NOTE(review): 'search'/'popular' previously fell through returning None,
    # so search crawls were never queued — likely lost in extraction; restored.
    add_work({
        "page_type": page_type,
        "keyword": keyword,
        "page_start": page_start,
        "page_limit": page_limit,
        "skip_exist": skip_exist,
    })
    return '排队中...'
856 | 857 | 858 | # 添加任务信息进队列,补充url信息 859 | def add_work(work: dict): 860 | data = { 861 | "page_type": work["page_type"], 862 | "keyword": work["keyword"], 863 | "page_start": 1, 864 | "page_limit": PAGE_MAX, 865 | "skip_exist": True, 866 | } 867 | # 补充可选参数 868 | for key in ["page_start", "page_limit", "skip_exist"]: 869 | if key in work: 870 | data[key] = work[key] 871 | data["url"] = get_url(data["page_type"], data["keyword"], data["page_start"]) 872 | QUEUE.put(data) 873 | 874 | 875 | @app.route('/action/last/insert') 876 | def action_last_insert(): 877 | return { 878 | "last_insert_list": SPIDER.get_last_insert_list(), 879 | "wait_work": list(QUEUE.queue), 880 | "running_work": SPIDER.get_running_work(), 881 | "done_work": SPIDER.get_done_work(), 882 | } 883 | 884 | 885 | # 磁盘扫描工具 886 | @app.route('/scandisk') 887 | def page_scandisk(): 888 | if 'path_target' not in request.values or 'file_target' not in request.values: 889 | return render_template('scandisk.html', frame_data={'title': '扫描硬盘'}) 890 | 891 | path_target = request.values['path_target'] 892 | path_target = upper_path(path_target) 893 | if not os.path.exists(path_target): 894 | return render_template('scandisk.html', frame_data={'title': '扫描硬盘'}) 895 | 896 | # 文件目标类型 897 | file_target = request.values['file_target'] 898 | # 路径信息 899 | file_res = [] 900 | av_data_map = {} 901 | extend_file_list = {} 902 | if file_target == "mp4": 903 | ret = storage(AV_EXTEND, {"extend_name": "movie_res"}) 904 | for row in ret: 905 | if row['key'] in extend_file_list: 906 | extend_file_list[row['key']].append(row['val']) 907 | else: 908 | extend_file_list[row['key']] = [row['val']] 909 | 910 | reg = FILE_TAIL[file_target] 911 | # 遍历所有文件 912 | for root, file in walk_all_files(path_target): 913 | # 不符合后缀要求略过 914 | if not re.search(reg, file): 915 | continue 916 | 917 | now_path = upper_path(os.path.join(root, file)) 918 | av_check = re.search(AV_FILE_REG, file) 919 | if file_target != "mp4" or not 
re.search(AV_FILE_REG, file): 920 | file_res.append({ 921 | 'file_path': now_path, 922 | 'file_target': file_target, 923 | }) 924 | continue 925 | 926 | # 格式化avid 927 | av_id = av_check.group(0).upper() 928 | exist = (av_id in extend_file_list and now_path in extend_file_list[av_id]) 929 | info = { 930 | # 是否已存储路径 931 | "has_res_extend": False, 932 | # 是否已抓取影片 933 | "has_fetch_movie": False, 934 | } 935 | if exist: 936 | info["has_res_extend"] = True 937 | 938 | file_res.append({ 939 | 'file_path': now_path, 940 | 'file_target': file_target, 941 | 'info': info, 942 | 'av_id': av_id, 943 | }) 944 | if file_target == "mp4": 945 | av_id_list = [x['av_id'] for x in file_res if 'av_id' in x] 946 | sql_text = "SELECT * FROM av_list WHERE av_id in ('{}')".format( 947 | "','".join(av_id_list)) 948 | for row in fetchall(sql_text): 949 | av_data_map[row['av_id']] = row 950 | 951 | for i in range(len(file_res)): 952 | if 'av_id' not in file_res[i]: 953 | continue 954 | if file_target == "mp4": 955 | if file_res[i]['av_id'] in av_data_map: 956 | file_res[i]['info']['has_fetch_movie'] = True 957 | 958 | return render_template('scandisk.html', 959 | file_res=file_res, 960 | av_data_map=av_data_map, 961 | file_target=file_target, 962 | path_target=path_target, 963 | frame_data={'title': '扫描硬盘'}) 964 | 965 | 966 | # 本地打开 967 | @app.route('/action/explorer') 968 | def action_explorer(): 969 | # 打开指定路径 970 | try: 971 | os.startfile(request.values["path"]) 972 | except FileNotFoundError as e: 973 | LOGGER.warn('FileNotFoundError,error:%r', e) 974 | return '文件未找到' 975 | return 'ok' 976 | 977 | 978 | # 查询扩展信息接口 979 | @app.route('/action/extend/select') 980 | def action_extend_select(): 981 | storage_res = storage(AV_EXTEND, dict(request.values)) 982 | return json.dumps(storage_res, ensure_ascii=False) 983 | 984 | 985 | # 添加扩展信息接口 986 | @app.route('/action/extend/insert') 987 | def action_extend_insert(): 988 | data = dict(request.values) 989 | data["val"] = data["val"].strip() 990 | 
val_list = storage(AV_EXTEND, {"extend_name": data["extend_name"], "key": [data["key"]]}, "val") 991 | 992 | # 影片资源 993 | if data["extend_name"] == "movie_res": 994 | if data["val"] in val_list: 995 | return "已存在不能重复添加" 996 | # 收藏 997 | if data["extend_name"] == "like": 998 | if data["val"] in val_list: 999 | return "已存在不能重复添加" 1000 | # 改名 1001 | if data["extend_name"] == "rename": 1002 | if AV_EXTEND in DATA_STORAGE: 1003 | del DATA_STORAGE[AV_EXTEND] 1004 | if 'rename' in DATA_STORAGE: 1005 | del DATA_STORAGE['rename'] 1006 | if empty(data["val"]): 1007 | del data['val'] 1008 | delete(AV_EXTEND, data) 1009 | return '已恢复原名称' 1010 | 1011 | if non_empty(val_list): 1012 | sql = "UPDATE av_extend SET val='{}' WHERE extend_name='{}' and key='{}'".format( 1013 | data["val"], data["extend_name"], data["key"]) 1014 | execute(sql) 1015 | return '已添加' 1016 | 1017 | insert(AV_EXTEND, [data]) 1018 | return '已添加' 1019 | 1020 | 1021 | # 删除扩展信息接口 1022 | @app.route('/action/extend/delete') 1023 | def action_extend_delete(): 1024 | delete(AV_EXTEND, dict(request.values)) 1025 | return '已删除' 1026 | 1027 | 1028 | # 删除影片 仅限手动调用 1029 | @app.route('/action/delete/movie/') 1030 | def action_delete_movie(linkid=''): 1031 | delete(AV_LIST, {'linkid': linkid}) 1032 | return 'movie已删除' 1033 | 1034 | 1035 | # 删除演员 仅限手动调用 1036 | @app.route('/action/delete/stars/') 1037 | def action_delete_stars(linkid=''): 1038 | star_movie = query_sql("SELECT linkid,stars_url FROM av_list WHERE stars_url like '%|{}%'".format(linkid)) 1039 | for item in star_movie: 1040 | move_star_list = query_sql("SELECT linkid FROM av_stars WHERE linkid IN ('{}')".format( 1041 | item['stars_url'].strip('|').replace('|', "','") 1042 | )) 1043 | if len(move_star_list) == 1: 1044 | delete(AV_LIST, {'linkid': item['linkid']}) 1045 | delete(AV_STARS, {'linkid': linkid}) 1046 | return 'star已删除' 1047 | 1048 | 1049 | # 标题翻译 1050 | @app.route('/action/translate') 1051 | def action_translate(): 1052 | tmp = 
request.values["words"].split(' ') 1053 | tmp.pop(0) 1054 | input_text = ''.join(tmp) 1055 | 1056 | res = requests.post('http://wap.youdao.com/translate', 1057 | data={'inputtext': input_text, 'type': 'JA2ZH_CN'}) 1058 | if res.status_code != 200 or len(res.text) < 20000: 1059 | return "出现错误.." + input_text 1060 | tt = re.findall('
    (.*?)
', 1061 | res.text, re.DOTALL) 1062 | if not tt: 1063 | return "出现错误.." + input_text 1064 | return tt[0].strip()[4:-5] 1065 | 1066 | 1067 | @app.route('/action/change/language') 1068 | def action_change_language(): 1069 | country = request.values['country'] 1070 | CONFIG.set("base", "country", country) 1071 | config_save(CONFIG) 1072 | config_init() 1073 | return 'ok' 1074 | 1075 | 1076 | # 分页 1077 | def pagination(pagenum, count, pageroot, pagelimit=None) -> dict: 1078 | if not pagelimit: 1079 | # 默认为主页的limit 1080 | pagelimit = CONFIG.getint("website", "page_limit") 1081 | pagecount = math.ceil(count / pagelimit) 1082 | total_max = 8 1083 | p1 = pagenum - total_max 1084 | p2 = pagenum + total_max 1085 | pagelist = [x for x in range(p1, p2 + 1) if 0 < x <= pagecount] 1086 | 1087 | pageleft = 0 1088 | pageright = 0 1089 | if pagecount != 0 and pagenum != pagecount: 1090 | pageright = pagenum + 1 1091 | if pagenum != 1: 1092 | pageleft = pagenum - 1 1093 | 1094 | pagehead = 0 1095 | pagetail = 0 1096 | if len(pagelist) > 0: 1097 | if pagelist[0] != 1: 1098 | pagehead = 1 1099 | if pagelist[-1] != pagecount: 1100 | pagetail = pagecount 1101 | return { 1102 | 'now': pagenum, 1103 | 'left': pageleft, 1104 | 'right': pageright, 1105 | 'list': pagelist, 1106 | 'head': pagehead, 1107 | 'tail': pagetail, 1108 | 'pageroot': pageroot, 1109 | 'count': count 1110 | } 1111 | 1112 | 1113 | # 查询列表 1114 | def select_av_list(av_list_where: list, page_num: int): 1115 | # 每页展示的数量 1116 | page_limit = CONFIG.getint("website", "page_limit") 1117 | 1118 | sql_order_by = "release_date DESC,av_id DESC" 1119 | sql_text = "" 1120 | if non_empty(av_list_where): 1121 | where_str = " AND ".join(av_list_where) 1122 | sql_text = "SELECT * FROM av_list WHERE {}".format(where_str) 1123 | else: 1124 | sql_text = "SELECT * FROM av_list" 1125 | 1126 | result = query_sql( 1127 | sql_text + ' ORDER BY {} LIMIT {},{}'.format(sql_order_by, (page_num - 1) * page_limit, page_limit)) 1128 | 1129 | for i in 
range(len(result)): 1130 | # 扩展信息 1131 | extend_list = storage(AV_EXTEND, {"extend_name": "movie_res", "key": [result[i]['av_id']]}, "val") 1132 | if not extend_list: 1133 | continue 1134 | for extend in extend_list: 1135 | extend = url_rename(extend) 1136 | if extend[:6] == "magnet" or extend[:3] == "115": 1137 | result[i]['magnet'] = 1 1138 | continue 1139 | if extend[:4] == "http": 1140 | result[i]['http'] = 1 1141 | continue 1142 | if extend[1] == ":": 1143 | result[i]['file'] = 1 1144 | return result, get_sql_count(sql_text) 1145 | 1146 | 1147 | def get_sql_count(sql_text): 1148 | return query_sql('SELECT COUNT(1) AS count FROM ({})'.format(sql_text))[0]['count'] 1149 | 1150 | 1151 | # 遍历文件 1152 | def walk_all_files(path_target): 1153 | for root, dirs, files in os.walk(path_target): 1154 | for file in files: 1155 | yield root, file 1156 | 1157 | 1158 | if __name__ == '__main__': 1159 | pass 1160 | --------------------------------------------------------------------------------