├── gcp-autostart
│   ├── app
│   │   ├── requirements.txt
│   │   └── main.py
│   ├── Dockerfile
│   └── README.md
├── oracle-lookbusy
│   ├── app
│   │   ├── speedtest.sh
│   │   └── docker-entrypoint.sh
│   ├── docker-compose.yml
│   └── Dockerfile
├── maccms-tool
│   ├── requirements.txt
│   ├── word.py
│   ├── main.py
│   └── config.yml
├── README.md
├── mail-api
│   ├── template
│   │   ├── theme_5
│   │   │   ├── author.html
│   │   │   └── reply.html
│   │   ├── theme_4
│   │   │   ├── author.html
│   │   │   └── reply.html
│   │   ├── theme_3
│   │   │   ├── reply.html
│   │   │   └── author.html
│   │   ├── theme_6
│   │   │   ├── author.html
│   │   │   └── reply.html
│   │   ├── theme_1
│   │   │   ├── author.html
│   │   │   └── reply.html
│   │   ├── theme_2
│   │   │   ├── author.html
│   │   │   └── reply.html
│   │   └── theme_7
│   │       └── reply.html
│   ├── PHPMailer
│   │   ├── Exception.php
│   │   ├── OAuthTokenProvider.php
│   │   ├── OAuth.php
│   │   └── POP3.php
│   └── index.php
├── wallpaper-dl
│   ├── remove_person_pic.py
│   ├── image_uploader.py
│   ├── 360.py
│   └── wallhaven.py
├── ddns-scripts
│   └── cloudflare
│       ├── ddns.sh
│       └── domain-ddns.sh
├── mtab-import
│   ├── bing-wp.py
│   └── website-info.py
└── LICENSE
--------------------------------------------------------------------------------
/gcp-autostart/app/requirements.txt:
--------------------------------------------------------------------------------
1 | google-cloud-compute==1.24.0
--------------------------------------------------------------------------------
/oracle-lookbusy/app/speedtest.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | pkill speedtest
3 | /usr/bin/speedtest --accept-license
--------------------------------------------------------------------------------
/maccms-tool/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Fog-Forest/scripts/HEAD/maccms-tool/requirements.txt
--------------------------------------------------------------------------------
/gcp-autostart/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.10-slim
2 | LABEL authors="Kinoko"
3 |
4 | WORKDIR /app
5 | COPY ./app/* .
6 |
7 | # Install dependencies
8 | RUN pip install --no-cache-dir -r requirements.txt
9 |
10 | ENV PYTHONUNBUFFERED=1
11 |
12 | CMD ["python", "main.py"]
13 |
--------------------------------------------------------------------------------
/gcp-autostart/README.md:
--------------------------------------------------------------------------------
1 | # GCP Preemptible Instance Auto-Start Script
2 |
3 | ## Run the container
4 |
5 | ```bash
6 | # First request a GCP service account key (choose the JSON type),
7 |
8 | mkdir /root/key # and put the key file into /root/key
9 | docker run -d --name gcp-autostart \
10 |   -e GCP_KEY_PATH=/app/key \
11 |   -e GCP_LOOP_INTERVAL=300 \
12 |   -v /root/key:/app/key \
13 |   fogforest/gcp-autostart
14 | ```
15 |
16 | ## View the logs
17 |
18 | ```bash
19 | docker logs -f gcp-autostart
20 | ```
21 |
22 |
--------------------------------------------------------------------------------
/oracle-lookbusy/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 |
3 | services:
4 |   lookbusy:
5 |     image: fogforest/lookbusy:latest
6 |     container_name: lookbusy
7 |     hostname: lookbusy
8 |     restart: always
9 |     environment:
10 |       - TZ=Asia/Shanghai
11 |       - CPU_UTIL=10-20 # CPU load in %, required; accepts a fixed value or a range
12 |       - CPU_CORE=1 # number of cores to load; defaults to all cores, set to 1 if the CPU gets pinned at 100%
13 |       - MEM_UTIL=15 # memory usage in %; omit to skip the memory load
14 |       - SPEEDTEST_INTERVAL=120 # speed-test interval in minutes; omit to skip the network load
15 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 📖 Scripts
2 |
3 | 🤪 A grab bag of scripts written on a whim, continuously updated; drop a ⭐ if you find them useful!
4 |
5 | ## Repository overview
6 |
7 | - **ddns-scripts**
8 |   Simple DDNS scripts for the Cloudflare provider.
9 | - **gcp-autostart**
10 |   Real-time status monitoring and auto-restart tool for GCP preemptible instances.
11 | - **maccms-tool**
12 |   MacCMS video metadata clean-up tool.
13 | - **mail-api**
14 |   Mail-sending API with multiple templates.
15 | - **mtab-import**
16 |   Scripts that enrich an mTab new-tab page with data (wallpapers, bookmarks).
17 | - **oracle-lookbusy**
18 |   Keep-alive tool for Oracle Cloud free-tier machines, using a smart resource-usage strategy.
19 | - **wallpaper-dl**
20 |   Multi-platform wallpaper downloader for Wallhaven and 360 Wallpaper.
21 |
22 | ## Disclaimer
23 |
24 | - The scripts in this project are for learning and personal use only; any commercial use is strictly prohibited.
25 | - Observe the laws of your country when using them; the user alone bears responsibility for any illegal activity.
26 |
27 |
--------------------------------------------------------------------------------
/oracle-lookbusy/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM debian:12
2 | LABEL authors="Kinoko"
3 |
4 | COPY ./app /app
5 | WORKDIR /app
6 |
7 | RUN apt update \
8 |     && apt install -y curl wget cron git cmake build-essential \
9 |     && git clone https://github.com/flow2000/lookbusy.git \
10 |     && curl -s https://packagecloud.io/install/repositories/ookla/speedtest-cli/script.deb.sh | bash \
11 |     && apt install -y speedtest \
12 |     && cd lookbusy && chmod +x ./configure && ./configure && make && make install \
13 |     && rm -rf /app/lookbusy && chmod +x /app/*.sh \
14 |     && apt autoremove -y && apt autoclean && apt remove -y && apt clean
15 |
16 | CMD [ "/app/docker-entrypoint.sh" ]
17 |
18 | USER root
19 |
--------------------------------------------------------------------------------
/mail-api/template/theme_5/author.html:
--------------------------------------------------------------------------------
1 |
{blogName}:《{title}》一文有新的评论啦!
4 |{author}在《{title}》评论:
7 |{text}
8 | IP地址:{ip}您在《{title}》一文上的留言有回复啦!
4 |{author},您好!
7 |您在文章《{title}》上的评论:
8 |{text}
9 |{replyAuthor}给您的回复如下:
10 |{replyText}
11 |您可以点击 查看回复的完整內容。
12 |感谢您对 {blogName} 的关注,如您有任何疑问,欢迎来我网站留言。
13 |(注:此邮件由系统自动发出,请勿回复。)
14 |{text}
67 |{text}
57 |{replyText}
62 || 6 | | 7 | | 8 | | 9 | | 10 | |
{author}在 20 | 《{title}》中发表评论:
21 |{text}
22 | 评论状态:{status}2019 30 | {blogName} 31 |
{text}
67 |{replyText}
71 |本邮件为{blogName}自动发送,请勿直接回复 | 查看文章 | {blogName}
75 |{text}
64 || 6 | | 7 | | 8 | | 9 | | 10 | |
{author},您曾在 20 | 《{title}》中发表评论:
21 |{text}
22 |{replyAuthor}给您的回复如下:
23 |{replyText}
24 |您可以 25 | 查看完整的回复内容,欢迎再次光临 26 | {blogName}!
27 |2019 31 | {blogName} - 邮件自动生成,请勿直接回复!
32 |©2021 Copyright {author}
15 ||
5 |
6 |
14 | 您在 [{blogName}] 发表的文章有新评论!
8 |
13 | {author} 在您的《{title}》上发表评论: 9 |{text} 10 |请注意:此邮件由 {blogName} 自动发送,请勿直接回复。 11 |若此邮件不是您请求的,请忽略并删除! 12 | |
15 |
|
5 |
6 |
17 | 您在 [{blogName}] 的评论有了新的回复!
8 |
16 | {author},您曾在文章《{title}》上发表评论: 9 |{text} 10 |{replyAuthor} 给您的回复如下: 11 |{replyText} 12 |您可以 查看回复完整内容,欢迎再次光临 {blogName}。 13 |请注意:此邮件由 {blogName} 自动发送,请勿直接回复。 14 |若此邮件不是您请求的,请忽略并删除! 15 | |
18 |
您的评论:
8 |{text}
9 |{replyAuthor} 给您的回复:
10 |{replyText}
11 |树在,山在,大地在,岁月在,我在,你还要怎样更好的世界?——张晓风《我在》
15 |本邮件为系统自动发送,请勿直接回复~
19 |©2021 Copyright {author}
22 |⚠️服务器探针告警!
49 || 5 | 51 | | 52 |
429 | echo '<pre>';
430 | foreach ($this->errors as $e) {
431 | print_r($e);
432 | }
433 | echo '</pre>';
434 | }
435 | }
436 |
437 | /**
438 | * Get an array of error messages, if any.
439 | *
440 | * @return array
441 | */
442 | public function getErrors()
443 | {
444 | return $this->errors;
445 | }
446 |
447 | /**
448 | * POP3 connection error handler.
449 | *
450 | * @param int $errno
451 | * @param string $errstr
452 | * @param string $errfile
453 | * @param int $errline
454 | */
455 | protected function catchWarning($errno, $errstr, $errfile, $errline)
456 | {
457 | $this->setError(
458 | 'Connecting to the POP3 server raised a PHP warning:' .
459 | "errno: $errno errstr: $errstr; errfile: $errfile; errline: $errline"
460 | );
461 | }
462 | }
463 |
--------------------------------------------------------------------------------
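The mail-api theme_* templates shown above are plain HTML fragments built around `{placeholder}` tokens such as `{blogName}`, `{title}`, `{author}`, `{text}`, `{replyAuthor}`, `{replyText}` and `{url}`; index.php (not reproduced in this excerpt) fills them in before handing the result to PHPMailer. A minimal Python sketch of that substitution step only (the real implementation is PHP, and the `render_template` helper below is hypothetical, not code from the repo):

```python
from pathlib import Path

def render_template(path: str, values: dict) -> str:
    """Fill {placeholder} tokens in a theme file with concrete values."""
    html = Path(path).read_text(encoding="utf-8")
    for key, value in values.items():
        html = html.replace("{" + key + "}", value)
    return html

# Example: render the reply-notification template of theme_1.
html = render_template("mail-api/template/theme_1/reply.html", {
    "blogName": "Example Blog",
    "title": "Hello World",
    "author": "Alice",
    "text": "Nice post!",
    "replyAuthor": "Bob",
    "replyText": "Thanks for reading!",
    "url": "https://example.com/hello-world",
})
print(html[:200])
```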
/maccms-tool/config.yml:
--------------------------------------------------------------------------------
1 | # Database connection info
2 | db:
3 | host: 10.0.0.11
4 | database: demo
5 | user: demo
6 | password: b26ZMnAAZMjjsmMc
7 | # Number of records to process; 'all' = update everything
8 | num: all
9 | # Replacement word lists
10 | word:
11 | class: # extended categories
12 | - { "7-12岁": "少儿,儿童",
13 | "13-17岁": "少年",
14 | "18岁及以上": "成人",
15 | "Hong": "",
16 | "Kong": "",
17 | "ITV": "电视,卫视",
18 | "LOLI": "萝莉",
19 | "VIP尊享": "",
20 | "VIP集": "剧集,剧情",
21 | "VIP": "",
22 | "中国动漫": "国创",
23 | "中国台湾": "台湾",
24 | "中国大陆": "内地",
25 | "中国香港": "香港",
26 | "中国": "国创",
27 | "儿歌精选": "儿歌,精选",
28 | "儿童搞笑": "儿童,搞笑",
29 | "儿童教育": "儿童,教育",
30 | "其他动漫": "其他,动漫",
31 | "其他综艺": "其他,综艺",
32 | "其它": "其他",
33 | "内地剧场": "内地,剧情",
34 | "动态漫": "动态漫画",
35 | "动态漫画画": "动态漫画",
36 | "动漫电影": "动漫,电影",
37 | "动漫音乐": "动漫,音乐",
38 | "动画电影": "动画,动漫,电影",
39 | "医疗健康": "医疗,健康",
40 | "卡通动漫": "卡通,动漫",
41 | "卫视剧": "电视,卫视",
42 | "即兴喜剧": "即兴,喜剧",
43 | "历史人文": "历史,人文",
44 | "历史文化": "历史,文化",
45 | "历史革命": "历史,革命",
46 | "反腐扫黑": "反腐,扫黑",
47 | "台剧": "台湾,剧集",
48 | "台湾剧": "台湾,剧集",
49 | "合作活动": "合作,活动",
50 | "启蒙英语": "启蒙,英语,学习,幼教",
51 | "唱唱跳跳": "唱跳,舞蹈",
52 | "国产综艺": "内地,综艺",
53 | "国产动漫": "国创,动漫",
54 | "国产动画": "国创,动画",
55 | "国产剧": "内地",
56 | "国产": "内地",
57 | "国学精粹": "国学,精粹",
58 | "国家地理": "地理",
59 | "外语学习": "外语,学习",
60 | "大陆综艺": "内地,综艺",
61 | "天津卫视": "天津,卫视,电视",
62 | "太空宇宙": "太空,宇宙",
63 | "女孩爱看": "女孩",
64 | "娱乐节目": "娱乐,明星",
65 | "婚恋情感": "婚恋,情感",
66 | "学前教育": "早教,教育,学习,幼儿",
67 | "学英语": "学习,英语",
68 | "安徽卫视": "安徽,卫视,电视",
69 | "幼儿认知": "早教,教育,学习,幼儿",
70 | "幼小教育": "早教,教育,学习,幼儿",
71 | "幽默集锦": "幽默,集锦,搞笑",
72 | "强档热播": "热播,院线",
73 | "情感交友": "情感,交友",
74 | "情景喜剧": "情景,喜剧,搞笑",
75 | "情景喜": "情景,喜剧,搞笑",
76 | "手工绘画": "手工,绘画",
77 | "抗疫救灾": "抗疫,救灾",
78 | "搜狐视频大视野": "搜狐出品",
79 | "搞笑幽默": "搞笑,幽默",
80 | "播报-专访": "播报,专访",
81 | "播报-出品": "播报,出品",
82 | "播报-明星": "播报,明星,娱乐",
83 | "播报-独家": "播报,独家",
84 | "播报-现场": "播报,现场",
85 | "收藏鉴宝": "收藏,鉴宝",
86 | "文化艺术": "文化,艺术",
87 | "日常生活": "日常,生活",
88 | "日本动漫": "日本,番剧",
89 | "日本动画": "日本,番剧,动画",
90 | "日本综艺": "日本,综艺",
91 | "日漫": "日本,番剧",
92 | "早教益智": "早教,益智,教育,学习",
93 | "明星八卦": "明星,八卦,娱乐",
94 | "明星访谈": "明星,访谈",
95 | "极限运动": "极限,运动,体育",
96 | "棚内真人秀": "真人秀",
97 | "欧美剧": "欧美,剧集",
98 | "欧美动漫": "欧美,动漫",
99 | "欧美动画": "欧美,动画",
100 | "欧美综艺": "欧美,综艺",
101 | "武术散打": "武术,散打",
102 | "母婴护理": "母婴,护理",
103 | "民族音乐": "音乐,民族音乐",
104 | "流行音乐": "音乐,流行,流行音乐",
105 | "浙江卫视": "浙江,卫视,电视",
106 | "港剧": "香港,剧集",
107 | "港台动漫": "香港,台湾,动漫",
108 | "港台综艺": "香港,台湾,综艺",
109 | "港台": "香港,台湾",
110 | "港澳剧": "香港,澳门,剧集",
111 | "港澳": "香港,澳门",
112 | "游戏改编": "游戏改",
113 | "游戏竞技": "游戏,竞技",
114 | "演唱会": "音乐,演唱会",
115 | "漫画改编": "漫画改",
116 | "潮流文化": "潮流,文化,流行",
117 | "热门综艺": "热门,综艺",
118 | "父母课堂": "父母,学习,课堂,教育",
119 | "瑞士Switzerland": "瑞士",
120 | "Switzerland": "瑞士",
121 | "生活娱乐": "生活,娱乐",
122 | "生活技巧": "生活,技巧",
123 | "生活服务": "生活,服务",
124 | "生活消费": "生活,消费",
125 | "生活百科": "生活,百科",
126 | "电视剧": "剧集",
127 | "电音": "电音,音乐",
128 | "男孩爱看": "男孩",
129 | "相声小品": "相声,小品",
130 | "真人特摄": "真人,特摄",
131 | "社会题材": "社会",
132 | "竖短片": "竖屏,短片",
133 | "竖短": "竖屏,短片",
134 | "童话绘本": "童话,绘本",
135 | "篮球": "篮球,体育,运动",
136 | "精选短": "精选,短片",
137 | "纪录片": "记录",
138 | "绘画手工": "绘画,手工",
139 | "网络剧": "网络,剧集,网剧",
140 | "网络游戏": "网络,游戏",
141 | "网络电影": "网络,电影",
142 | "罪案": "犯罪,凶案,罪案",
143 | "美少女": "美女,少女,美少女",
144 | "美食教学": "美食,教学",
145 | "美食文化": "美食,文化",
146 | "美食旅游": "美食,旅游",
147 | "职业技能": "职业,技能",
148 | "自制节目": "自制",
149 | "自然科学": "自然,科学",
150 | "西班牙SPain": "西班牙",
151 | "记录片": "记录",
152 | "语言表达": "语言,表达",
153 | "课堂知识": "课堂,知识,学习",
154 | "超级网剧": "网络,剧集,网剧",
155 | "轻小说改编": "轻小说,小说改",
156 | "韩国动漫": "韩国,动漫",
157 | "韩国动画": "韩国,动画",
158 | "韩国综艺": "韩国,综艺",
159 | "音乐亚洲": "音乐,亚洲",
160 | "音乐剧": "音乐,剧集",
161 | "预告&剧八卦": "预告,剧集,八卦" }
162 | area: # regions
163 | - { ",": "",
164 | "(": "",
165 | ")": "",
166 | "/": ",",
167 | " ": "",
168 | ":中国大陆": "内地",
169 | "马来西亚Malaysia": "马来西亚",
170 | "馬來西亞Malay": "马来西亚",
171 | "马拉西亚": "马来西亚",
172 | "马来西": "马来西亚",
173 | "马来西亚亚": "马来西亚",
174 | "超级飞侠每一集飞往世界各地不同的城市或地": "其他",
175 | "电视剧以改革初期陕北地区的城乡生活为时空": "其他",
176 | "《果味香村》以黄桃、蜜瓜、刺梨、苹果、蜜": "其他",
177 | "在西南地区的一个刚刚脱贫的小村落——高石": "其他",
178 | "《我是冒险王》是青海卫视的一档探险栏目": "其他",
179 | "大型活动《芒果新童星》关注贫困地区儿童": "其他",
180 | "以日本东北地区和东京为舞台描写了命运悲": "其他",
181 | "刘昴星(小当家)是史上最年轻的通过中国": "日本",
182 | "记录了7个少数民族的当下生活体现的是少": "其他",
183 | "《坐庄2操盘手》是王珈执导的悬疑犯罪片": "其他",
184 | "清朝道光年间皇家御用烧锅“同盛金”埋藏": "其他",
185 | "适逢改革开放40周年美国格律文化传媒集": "其他",
186 | "以六盘山为切入点用鲜活的故事充分展示": "其他",
187 | "《不老乡音第二季》穿行湘西大地用镜头": "其他",
188 | "7月瓜果飘香是大力开展农产品销售和迎": "其他",
189 | "北京电影制片厂年出品": "北京",
190 | "蘇聯": "苏联",
191 | "菲律宾Philippines": "菲律宾",
192 | "荷兰Netherlands": "荷兰",
193 | "芬兰Finland": "芬兰",
194 | "美國USA": "美国",
195 | "美国南非加拿大": "美国,南非,加拿大",
196 | "美国、英国、德国": "美国,英国,德国",
197 | "瑞士Switzerland": "瑞士",
198 | "瑞典Sweden": "瑞典",
199 | "澳大利亚Australia": "澳大利亚",
200 | "港台": "香港,台湾",
201 | "泰國": "泰国",
202 | "法国德国日本": "法国,德国,日本",
203 | "沙特阿拉伯SaudiAra": "沙特阿拉伯",
204 | "比利时Belgium": "比利时",
205 | "比利": "比利时",
206 | "比利时时": "比利时",
207 | "英语": "其他",
208 | "皆可": "其他",
209 | "波黑": "波斯尼亚,黑塞哥维那",
210 | "欧美地区": "欧美",
211 | "欧美其他": "欧美,其他",
212 | "未知": "其他",
213 | "智利Chile": "智利",
214 | "日韩地区": "日本,韩国",
215 | "日韩": "日本,韩国",
216 | "日本日本剧": "日本",
217 | "新马": "新加坡,马来西亚",
218 | "新加坡美国": "新加坡,美国",
219 | "新加坡Singapore": "新加坡",
220 | "摩纳": "摩纳哥",
221 | "摩纳哥哥": "摩纳哥",
222 | "捷克斯洛伐克Czechoslovaki": "捷克斯洛伐克",
223 | "捷克美国": "捷克,美国",
224 | "德國": "德国",
225 | "德语": "德国",
226 | "意大": "意大利",
227 | "意大利利": "意大利",
228 | "巴勒斯坦被占领区": "巴勒斯坦",
229 | "委内瑞拉Venezuela": "委内瑞拉",
230 | "国外": "其他",
231 | "塞尔维": "塞尔维亚",
232 | "塞尔维亚亚": "塞尔维亚",
233 | "埃塞俄比亚Ethi": "埃塞俄比亚",
234 | "土耳其Turkey": "土耳其",
235 | "Turkey": "土耳其",
236 | "土耳": "土耳其",
237 | "土耳其其": "土耳其",
238 | "哈萨克斯": "哈萨克斯坦",
239 | "哈萨克斯坦坦": "哈萨克斯坦",
240 | "台灣Taiwan": "台湾",
241 | "古巴Cuba": "古巴",
242 | "叙利亚Syria": "叙利亚",
243 | "印度Indian": "印度",
244 | "印度India": "印度",
245 | "India": "印度",
246 | "印尼Indonesia": "印尼",
247 | "北京": "内地",
248 | "匈牙利Hungary": "匈牙利",
249 | "动漫": "日本",
250 | "加拿大Canada": "加拿大",
251 | "利比": "利比里亚",
252 | "利比里亚里亚": "利比里亚",
253 | "其它": "其他",
254 | "俄罗斯哈萨克斯坦": "俄罗斯,哈萨克斯坦",
255 | "俄罗斯Russia": "俄罗斯",
256 | "俄国Russia": "俄国",
257 | "中国香港": "香港",
258 | "中国香": "香港",
259 | "中国澳门": "澳门",
260 | "中国大陆法国": "内地,法国",
261 | "中国大陆": "内地",
262 | "中国内地": "内地",
263 | "中国大": "内地",
264 | "中国台湾": "台湾",
265 | "不详": "其他",
266 | "中国": "内地",
267 | "大陆": "内地",
268 | "USA": "美国",
269 | "UK": "英国",
270 | "U.S.A": "美国",
271 | "SouthAfrica": "南非",
272 | "NZ": "荷兰",
273 | "Mexico": "墨西哥",
274 | "Germany": "德国",
275 | "Denmark": "丹麦",
276 | "Canada加拿大": "加拿大",
277 | "Canada": "加拿大",
278 | "Australia": "澳大利亚",
279 | "Switzerland": "瑞士" }
280 | lang: # languages
281 | - { "/": ",",
282 | " ": "",
283 | "55": "",
284 | ":汉语普通话": "普通话",
285 | ":韩语": "韩语",
286 | "马拉地语Marat": "马拉地语",
287 | "马来西亚": "马来语",
288 | "马来西": "马来语",
289 | "马来": "马来语",
290 | "音乐": "",
291 | "丹麦语Danish": "丹麦语",
292 | "丹麦语D": "丹麦语",
293 | "丹麦": "丹麦语",
294 | "乌克兰语Ukari": "乌克兰语",
295 | "乌克兰": "乌克兰语",
296 | "乌尔都": "乌尔都语",
297 | "俄罗斯语Russi": "俄语",
298 | "俄罗斯语": "俄语",
299 | "俄罗斯": "俄语",
300 | "俄语Russian": "俄语",
301 | "俄语Russina": "俄语",
302 | "俄語": "俄语",
303 | "克丘亚": "克丘亚语",
304 | "兰州": "",
305 | "其它": "其他",
306 | "冰岛语Icelan": "冰岛语",
307 | "比印度语Hindi": "印地语",
308 | "北印度语Hindi": "印地语",
309 | "北印度语": "印地语",
310 | "印度语Hindi": "印地语",
311 | "印度语": "印地语",
312 | "印地语Hindi": "印地语",
313 | "印地语h": "印地语",
314 | "印地": "印地语",
315 | "印度India": "印地语",
316 | "印度尼西亚语": "印尼语",
317 | "印度尼西亚": "印尼语",
318 | "印尼语Indone": "印尼语",
319 | "印度": "印地语",
320 | "南非語": "南非语",
321 | "四川方言": "四川话",
322 | "泰国语": "泰语",
323 | "泰国": "泰语",
324 | "泰語": "泰语",
325 | "泰米尔语Tamil": "泰米尔语",
326 | "土耳其语Turke": "土耳其语",
327 | "土耳其语Turki": "土耳其语",
328 | "土尔其语": "土耳其语",
329 | "土耳其": "土耳其语",
330 | "塞尔维亚克罗地亚语": "塞尔维亚语,克罗地亚语",
331 | "塞尔维亚-克罗地亚语": "塞尔维亚语,克罗地亚语",
332 | "塞尔维亚-": "塞尔维亚语",
333 | "塞尔维亚": "塞尔维亚语",
334 | "暂无": "其他",
335 | "未知": "其他",
336 | "国产": "普通话",
337 | "国语大陆": "普通话",
338 | "国语": "普通话",
339 | "中国大陆": "普通话",
340 | "中国大": "普通话",
341 | "中国香港": "粤语",
342 | "中国香": "粤语",
343 | "中国台湾": "闽南语",
344 | "台湾": "闽南语",
345 | "台语": "闽南语",
346 | "中国": "普通话",
347 | "中文": "普通话",
348 | "湖南方言": "湖南话,方言",
349 | "闽南方言": "闽南语,方言",
350 | "云南方言": "云南语,方言",
351 | "云南语": "云南语,方言",
352 | "闽南话": "闽南语",
353 | "上海": "上海话,方言",
354 | "南京": "南京话,方言",
355 | "北京": "北京话,方言",
356 | "四川": "四川话,方言",
357 | "山东": "山东话,方言",
358 | "客家": "客家话,方言",
359 | "徐州": "徐州话,方言",
360 | "武汉": "武汉话,方言",
361 | "河南": "河南话,方言",
362 | "福建": "福建话,方言",
363 | "胶辽": "胶辽话,方言",
364 | "重庆": "重庆话,方言",
365 | "陕西": "陕西话,方言",
366 | "闽南": "闽南语",
367 | "吴越": "吴语,方言",
368 | "维吾": "维语",
369 | "太湖": "",
370 | "河南越调": "越剧",
371 | "汉语普通话Mand": "普通话",
372 | "汉语普通话": "普通话",
373 | "汉语普通": "普通话",
374 | "汉语普": "普通话",
375 | "汉语方言及普通话": "普通话,方言",
376 | "汉语方言": "普通话,方言",
377 | "汉语四川话": "普通话,四川话,方言",
378 | "汉语": "普通话",
379 | "方言越调": "越剧",
380 | "方言话": "方言",
381 | "佛兰德斯语": "荷兰语",
382 | "佛兰德语": "荷兰语",
383 | "南非荷兰语": "南非语",
384 | "南非": "南非语",
385 | "加泰罗尼亚": "加泰罗尼亚语",
386 | "加泰罗": "加泰罗尼亚语",
387 | "加泰罗尼亚语尼亚语": "加泰罗尼亚语",
388 | "加拿大": "英语",
389 | "古希腊": "希腊语",
390 | "塔伽洛": "塔伽洛语",
391 | "墨西哥": "西班牙语",
392 | "西班牙": "西班牙语",
393 | "奥地利": "德语",
394 | "巴西": "葡萄牙语",
395 | "希伯来语Hebre": "希伯来语",
396 | "希伯来": "希伯来语",
397 | "库尔德": "库尔德语",
398 | "德国": "德语",
399 | "德語": "德语",
400 | "意大利": "意大利语",
401 | "意大": "意大利语",
402 | "意大利语利语": "意大利语",
403 | "挪威语Norweg": "挪威语",
404 | "挪威": "挪威语",
405 | "捷克斯洛伐克": "捷克语",
406 | "捷克语Czech": "捷克语",
407 | "捷克": "捷克语",
408 | "斯洛文尼亚": "斯洛文尼亚语",
409 | "无声": "默片",
410 | "无对白": "默片",
411 | "日本": "日语",
412 | "日語": "日语",
413 | "智利": "西班牙语",
414 | "朝鲜": "朝鲜语",
415 | "法国": "法语",
416 | "波兰语Polish": "波兰语",
417 | "波利尼西亚": "波利尼西亚语",
418 | "波斯语Persia": "波斯语",
419 | "波斯": "波斯语",
420 | "波黑": "波斯尼亚语",
421 | "波斯语尼亚语": "波斯尼亚语",
422 | "泰卢固语Te": "泰卢固语",
423 | "泰卢固": "泰卢固语",
424 | "斯洛伐": "斯洛伐克语",
425 | "斯洛伐克语克语": "斯洛伐克语",
426 | "比利时": "法语",
427 | "澳大利亚": "英语",
428 | "爱尔兰盖尔": "爱尔兰语",
429 | "爱沙尼亚": "爱沙尼亚语",
430 | "瑞典语Swedis": "瑞典语",
431 | "瑞典语S": "瑞典语",
432 | "瑞典": "瑞典语",
433 | "瑞士德语Swiss-": "德语",
434 | "瑞士德语": "德语",
435 | "瑞士语言": "德语",
436 | "瑞士语": "德语",
437 | "瑞士": "德语",
438 | "粵語": "粤语",
439 | "罗马尼亚语Roma": "罗马尼亚语",
440 | "罗马尼": "罗马尼亚语",
441 | "罗马尼亚语亚语": "罗马尼亚语",
442 | "美国手语": "英语,手语",
443 | "美国": "英语",
444 | "芬兰语Finnis": "芬兰语",
445 | "芬兰": "芬兰语",
446 | "艾马拉": "马拉语",
447 | "苏格兰盖尔": "苏格兰盖尔语",
448 | "苏联": "俄语",
449 | "英語英语": "英语",
450 | "英语英语": "英语",
451 | "英国": "英语",
452 | "荷兰语Dutch": "荷兰语",
453 | "荷蘭語": "荷兰语",
454 | "荷兰": "荷兰语",
455 | "菲律宾语Filip": "菲律宾语",
456 | "菲律宾塔加": "菲律宾语",
457 | "菲律宾": "菲律宾语",
458 | "葡萄牙语Portu": "葡萄牙语",
459 | "葡萄牙": "葡萄牙语",
460 | "蒙古语": "蒙语",
461 | "蒙古": "蒙语",
462 | "越南语Vietna": "越南语",
463 | "越南": "越南语",
464 | "越语": "越南语",
465 | "赣语Gan": "赣语",
466 | "西西里": "西西里语",
467 | "高棉": "高棉语",
468 | "普听话": "普通话",
469 | "阿姆哈": "阿姆哈拉语",
470 | "阿姆哈拉语拉语": "阿姆哈拉语",
471 | "阿拉伯": "阿拉伯语",
472 | "阿拉": "阿拉伯语",
473 | "阿拉伯语伯语": "阿拉伯语",
474 | "阿布": "其他",
475 | "韩国": "韩语",
476 | "马拉地语": "马拉地语",
477 | "Afrikaans": "荷兰语",
478 | "Athap": "其他",
479 | "Cantonese": "粤语",
480 | "Danish": "丹麦语",
481 | "Dari": "达里语",
482 | "English": "英语",
483 | "French": "法语",
484 | "Galic": "加利奇语",
485 | "German": "德语",
486 | "Hindi": "印地语",
487 | "India": "印地语",
488 | "Icelandic": "冰岛语",
489 | "Luxembourg": "卢森堡语",
490 | "Malayalam": "马拉雅拉姆语",
491 | "Pasht": "其他",
492 | "Persian": "波斯语",
493 | "Russian": "俄语",
494 | "Silent": "默片",
495 | "Swahi": "斯瓦希里语",
496 | "SwissGerm": "瑞士语",
497 | "Tamil": "泰米尔语",
498 | "Telugu": "泰卢固语",
499 | "Telu": "泰卢固语",
500 | "Turkish": "土耳其语",
501 | "Ukrai": "乌克兰语",
502 | "Welsh": "威尔士语",
503 | "Zulu": "祖鲁语",
504 | "spanish": "西班牙语",
505 | "话话": "话",
506 | "语语": "语" }
--------------------------------------------------------------------------------
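word.py and main.py of maccms-tool are not included in this excerpt, but the `word` section above is simply a set of search-and-replace tables (each section is a single-element list holding one flow mapping). A minimal sketch of how such tables could be loaded and applied, assuming PyYAML is available; the helper names are illustrative and not taken from the repo:

```python
import yaml

def load_word_maps(path: str = "config.yml") -> dict:
    """Flatten each section's list of flow mappings into one dict per section."""
    with open(path, encoding="utf-8") as fh:
        cfg = yaml.safe_load(fh)
    return {
        section: {k: v for item in entries for k, v in item.items()}
        for section, entries in cfg["word"].items()
    }

def apply_replacements(value: str, mapping: dict) -> str:
    """Apply longer keys first so that e.g. '中国香港' wins over '中国'."""
    for key in sorted(mapping, key=len, reverse=True):
        value = value.replace(key, mapping[key])
    return value

word_maps = load_word_maps()
print(apply_replacements("中国香港", word_maps["area"]))   # -> 香港
print(apply_replacements("汉语普通话", word_maps["lang"]))  # -> 普通话
```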
/wallpaper-dl/360.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # coding=utf8
3 | # @Author: Modified based on Kinoko's script
4 | # @Date : 2025/08/10
5 | # @Desc : 360壁纸批量下载脚本 - 支持过滤黑白、纯色背景、偏暗和相似图片
6 | import logging
7 | import os
8 | import re
9 | import time
10 | from concurrent.futures import ThreadPoolExecutor, as_completed
11 | from io import BytesIO
12 |
13 | import numpy as np
14 | import requests
15 | from PIL import Image
16 | from sklearn.cluster import KMeans
17 | from tqdm import tqdm
18 |
19 | # ===================== 配置项 =====================
20 | # API基础地址
21 | API_BASE_URL = "http://wallpaper.apc.360.cn/index.php"
22 |
23 | # 分类映射关系 (cid: 分类名称)
24 | CATEGORY_MAPPING = {
25 | "14": "动物萌宠"
26 | }
27 |
28 | # 每页图片数量
29 | PAGE_SIZE = 100
30 |
31 | # 自定义下载根目录
32 | DOWNLOAD_ROOT_DIR = "D:/DL"
33 |
34 | # 并发下载线程数
35 | MAX_WORKERS = 5
36 |
37 | # 请求超时时间(秒)
38 | TIMEOUT = 10
39 |
40 | # 请求头
41 | HEADERS = {
42 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
43 | "Accept": "application/json, text/plain, */*",
44 | "Connection": "keep-alive"
45 | }
46 |
47 | # 下载失败重试次数
48 | MAX_RETRIES = 3
49 |
50 | # 重试延迟时间(秒)
51 | RETRY_DELAY = 2
52 |
53 | # 图片过滤配置
54 | BLACK_WHITE_THRESHOLD = 20 # 黑白判断阈值
55 | SOLID_BACKGROUND_THRESHOLD = 0.6 # 纯色背景判断阈值
56 | BRIGHTNESS_THRESHOLD = 50 # 亮度阈值(0-255)
57 | SIMILARITY_THRESHOLD = 5 # 相似图片判断阈值(汉明距离),值越小要求越相似
58 |
59 | # 日志配置
60 | logging.basicConfig(
61 | level=logging.INFO,
62 | format='%(asctime)s - %(levelname)s - %(message)s',
63 | datefmt='%Y-%m-%d %H:%M:%S'
64 | )
65 | logger = logging.getLogger(__name__)
66 |
67 | # 存储已下载图片的哈希值,用于相似性检查
68 | image_hashes = {} # 结构: {category_name: [hash_values]}
69 |
70 |
71 | # =================================================
72 |
73 |
74 | def calculate_perceptual_hash(image, hash_size=16):
75 | """计算图片的感知哈希值"""
76 | try:
77 | # 缩小图片尺寸并转为灰度图
78 | img = image.resize((hash_size, hash_size), Image.LANCZOS).convert('L')
79 | img_array = np.array(img)
80 |
81 | # 计算平均亮度
82 | avg_brightness = img_array.mean()
83 |
84 | # 生成哈希值:像素亮度高于平均为1,否则为0
85 | hash_array = (img_array > avg_brightness).flatten()
86 |
87 | # 转换为整数哈希值
88 | hash_value = 0
89 | for bit in hash_array:
90 | hash_value = (hash_value << 1) | (1 if bit else 0)
91 |
92 | return hash_value
93 | except Exception as e:
94 | logger.error(f"计算哈希值失败: {str(e)}")
95 | return None
96 |
97 |
98 | def hamming_distance(hash1, hash2):
99 | """计算两个哈希值的汉明距离"""
100 | if hash1 is None or hash2 is None:
101 | return float('inf') # 无法计算时视为差异极大
102 | # 计算两个哈希值的异或结果中1的个数
103 | return bin(hash1 ^ hash2).count('1')
104 |
105 |
106 | def is_similar_to_existing(image, category_name):
107 | """判断图片是否与同分类中已下载的图片相似"""
108 | if category_name not in image_hashes:
109 | return False, None
110 |
111 | current_hash = calculate_perceptual_hash(image)
112 | if current_hash is None:
113 | return False, None
114 |
115 | # 与同分类中所有已下载图片比较
116 | for existing_hash in image_hashes[category_name]:
117 | distance = hamming_distance(current_hash, existing_hash)
118 | if distance < SIMILARITY_THRESHOLD:
119 | return True, distance
120 |
121 | return False, None
122 |
123 |
124 | def is_black_white(image):
125 | """判断图片是否为黑白"""
126 | try:
127 | img_rgb = image.convert('RGB')
128 | img_array = np.array(img_rgb).astype(np.int16)  # use a signed dtype so the channel differences below don't wrap around
129 |
130 | r, g, b = img_array[:, :, 0], img_array[:, :, 1], img_array[:, :, 2]
131 | diff1 = np.abs(r - g)
132 | diff2 = np.abs(r - b)
133 | diff3 = np.abs(g - b)
134 |
135 | total_pixels = img_array.shape[0] * img_array.shape[1]
136 | bw_pixels = np.sum((diff1 < BLACK_WHITE_THRESHOLD) &
137 | (diff2 < BLACK_WHITE_THRESHOLD) &
138 | (diff3 < BLACK_WHITE_THRESHOLD))
139 |
140 | return bw_pixels / total_pixels > 0.95
141 | except Exception as e:
142 | logger.error(f"黑白判断失败: {str(e)}")
143 | return False
144 |
145 |
146 | def has_solid_background(image):
147 | """判断图片是否有纯色背景"""
148 | try:
149 | img_rgb = image.convert('RGB')
150 | img_array = np.array(img_rgb)
151 | pixels = img_array.reshape(-1, 3)
152 |
153 | kmeans = KMeans(n_clusters=min(10, len(pixels)), random_state=42)
154 | kmeans.fit(pixels)
155 |
156 | cluster_counts = np.bincount(kmeans.labels_)
157 | max_cluster_ratio = np.max(cluster_counts) / len(pixels)
158 |
159 | return max_cluster_ratio > SOLID_BACKGROUND_THRESHOLD
160 | except Exception as e:
161 | logger.error(f"纯色背景判断失败: {str(e)}")
162 | return False
163 |
164 |
165 | def is_too_dark(image):
166 | """判断图片是否偏暗"""
167 | try:
168 | img_gray = image.convert('L')
169 | img_array = np.array(img_gray)
170 | average_brightness = np.mean(img_array)
171 | return average_brightness < BRIGHTNESS_THRESHOLD
172 | except Exception as e:
173 | logger.error(f"亮度判断失败: {str(e)}")
174 | return False
175 |
176 |
177 | def download_and_filter_image(url, save_path, category_name):
178 | """下载图片并进行过滤"""
179 | for attempt in range(MAX_RETRIES):
180 | try:
181 | logger.debug(f"尝试下载 {url} (第 {attempt + 1} 次)")
182 | response = requests.get(
183 | url,
184 | headers=HEADERS,
185 | timeout=TIMEOUT,
186 | stream=True
187 | )
188 | response.raise_for_status()
189 |
190 | image_data = BytesIO(response.content)
191 |
192 | try:
193 | with Image.open(image_data) as img:
194 | # 检查是否为黑白图片
195 | if is_black_white(img):
196 | logger.debug(f"过滤黑白图片: {url}")
197 | return False, "黑白图片"
198 |
199 | # 检查是否为纯色背景图片
200 | if has_solid_background(img):
201 | logger.debug(f"过滤纯色背景图片: {url}")
202 | return False, "纯色背景图片"
203 |
204 | # 检查是否为偏暗图片
205 | if is_too_dark(img):
206 | logger.debug(f"过滤偏暗图片: {url}")
207 | return False, "偏暗图片"
208 |
209 | # 检查是否与已下载图片相似
210 | is_similar, distance = is_similar_to_existing(img, category_name)
211 | if is_similar:
212 | logger.debug(f"过滤相似图片 (距离: {distance}): {url}")
213 | return False, f"相似图片 (距离: {distance})"
214 |
215 | except Exception as e:
216 | logger.warning(f"图片分析失败 {url} (格式可能异常): {str(e)}")
217 | return False, "图片格式异常"
218 |
219 | # 保存图片
220 | os.makedirs(os.path.dirname(save_path), exist_ok=True)
221 | with open(save_path, 'wb') as f:
222 | f.write(response.content)
223 |
224 | # 计算并保存哈希值
225 | with Image.open(save_path) as saved_img:
226 | img_hash = calculate_perceptual_hash(saved_img)
227 | if img_hash is not None:
228 | if category_name not in image_hashes:
229 | image_hashes[category_name] = []
230 | image_hashes[category_name].append(img_hash)
231 |
232 | logger.debug(f"成功下载: {save_path}")
233 | return True, "成功"
234 |
235 | except Exception as e:
236 | if attempt < MAX_RETRIES - 1:
237 | logger.warning(
238 | f"下载失败 {url} (第 {attempt + 1} 次): {str(e)},将重试..."
239 | )
240 | time.sleep(RETRY_DELAY * (attempt + 1))
241 | continue
242 |
243 | logger.error(f"下载失败 {url} (已达最大重试次数): {str(e)}")
244 | return False, f"下载失败: {str(e)}"
245 | return False, "重试次数耗尽"  # unreachable in practice, but keeps the (success, reason) return contract
246 |
247 |
248 | def fetch_page_images(category_id, start_index):
249 | """获取指定分类和起始位置的图片列表"""
250 | try:
251 | params = {
252 | "c": "WallPaper",
253 | "a": "getAppsByCategory",
254 | "cid": category_id,
255 | "start": start_index,
256 | "count": PAGE_SIZE,
257 | "from": "360chrome"
258 | }
259 |
260 | logger.debug(f"请求URL: {API_BASE_URL}, 参数: {params}")
261 | response = requests.get(API_BASE_URL, params=params, headers=HEADERS, timeout=TIMEOUT)
262 | response.raise_for_status()
263 | return response.json()
264 | except Exception as e:
265 | logger.error(f"获取起始位置 {start_index} 的数据失败: {str(e)}")
266 | return None
267 |
268 |
269 | def collect_all_image_urls():
270 | """收集所有分类的图片URL,进行全局去重"""
271 | logger.info("====== 开始收集所有分类的图片URL ======")
272 |
273 | all_images = {}
274 | total_count = 0
275 |
276 | for category_id, category_name in CATEGORY_MAPPING.items():
277 | logger.info(f"开始收集分类: {category_name} (ID: {category_id}) 的图片URL")
278 | save_dir = os.path.join(DOWNLOAD_ROOT_DIR, category_name)
279 |
280 | try:
281 | # 获取第一页数据以确定总数
282 | first_page_data = fetch_page_images(category_id, 0)
283 | if not first_page_data or first_page_data.get("errno") != "0":
284 | error_msg = first_page_data.get("errmsg", "未知错误") if first_page_data else "无法获取数据"
285 | logger.error(f"API请求失败: {error_msg}")
286 | continue
287 |
288 | total_images = int(first_page_data.get("total", 0))
289 | total_count += total_images
290 |
291 | if total_images == 0:
292 | logger.info(f"分类 {category_name} 没有找到壁纸")
293 | continue
294 |
295 | logger.info(f"分类 {category_name} 发现 {total_images} 张壁纸")
296 |
297 | # 计算需要请求的页数
298 | pages = (total_images + PAGE_SIZE - 1) // PAGE_SIZE
299 |
300 | for page in range(pages):
301 | start_index = page * PAGE_SIZE
302 | # 避免请求超出总数
303 | if start_index >= total_images:
304 | break
305 |
306 | page_data = fetch_page_images(category_id, start_index)
307 | if not page_data or page_data.get("errno") != "0":
308 | error_msg = page_data.get("errmsg", "未知错误") if page_data else "无法获取数据"
309 | logger.warning(f"获取起始位置 {start_index} 失败: {error_msg},将跳过该页")
310 | continue
311 |
312 | for item in page_data.get("data", []):
313 | raw_url = item.get("url", "")
314 | # 移除了URL清理逻辑,直接使用原始URL
315 | if not raw_url:
316 | continue
317 |
318 | # 生成图片名称
319 | image_id = item.get("id", str(int(time.time() * 1000)))
320 | # 从URL提取扩展名
321 | ext_match = re.search(r'\.(\w+)(?:\?|$)', raw_url)
322 | ext = ext_match.group(1) if ext_match else 'jpg'
323 | image_name = f"{image_id}.{ext}"
324 | image_name = re.sub(r'[\\/*?:"<>|]', "", image_name)
325 | save_path = os.path.join(save_dir, image_name)
326 |
327 | if raw_url not in all_images:
328 | all_images[raw_url] = (category_name, save_path)
329 |
330 | logger.info(f"已收集分类 {category_name} 第 {page + 1}/{pages} 页的图片链接")
331 | time.sleep(1) # 避免请求过于频繁
332 |
333 | except Exception as e:
334 | logger.error(f"收集分类 {category_name} URL时出错: {str(e)}", exc_info=True)
335 |
336 | duplicate_count = total_count - len(all_images)
337 | logger.info(
338 | f"URL收集完成,原始总计 {total_count} 张,去重后剩余 {len(all_images)} 张,移除了 {duplicate_count} 个重复链接")
339 |
340 | categorized_images = {}
341 | for url, (category_name, save_path) in all_images.items():
342 | if category_name not in categorized_images:
343 | categorized_images[category_name] = []
344 | categorized_images[category_name].append((url, save_path))
345 |
346 | return categorized_images
347 |
348 |
349 | def download_categorized_images(categorized_images):
350 | """按分类下载整理好的图片"""
351 | logger.info("====== 开始按分类下载图片 ======")
352 | os.makedirs(DOWNLOAD_ROOT_DIR, exist_ok=True)
353 |
354 | # 初始化哈希存储
355 | global image_hashes
356 | image_hashes = {category: [] for category in categorized_images.keys()}
357 |
358 | total_stats = {"total": 0, "success": 0, "failed": 0,
359 | "filtered_black_white": 0, "filtered_solid_bg": 0,
360 | "filtered_dark": 0, "filtered_similar": 0}
361 |
362 | for category_name, image_list in categorized_images.items():
363 | logger.info(f"开始处理分类: {category_name},共 {len(image_list)} 张图片")
364 | cat_stats = {
365 | "total": len(image_list),
366 | "success": 0,
367 | "failed": 0,
368 | "filtered_black_white": 0,
369 | "filtered_solid_bg": 0,
370 | "filtered_dark": 0,
371 | "filtered_similar": 0
372 | }
373 |
374 | with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
375 | # 提交任务时传递分类名称
376 | futures = {
377 | executor.submit(download_and_filter_image, url, path, category_name): (url, path)
378 | for url, path in image_list
379 | }
380 |
381 | for future in tqdm(as_completed(futures), total=len(futures), desc=f"下载 {category_name}"):
382 | url, path = futures[future]
383 | result, reason = future.result()
384 | if result:
385 | cat_stats["success"] += 1
386 | else:
387 | if reason == "黑白图片":
388 | cat_stats["filtered_black_white"] += 1
389 | logger.info(f"已过滤 {reason}: {url}")
390 | elif reason == "纯色背景图片":
391 | cat_stats["filtered_solid_bg"] += 1
392 | logger.info(f"已过滤 {reason}: {url}")
393 | elif reason == "偏暗图片":
394 | cat_stats["filtered_dark"] += 1
395 | logger.info(f"已过滤 {reason}: {url}")
396 | elif reason.startswith("相似图片"):
397 | cat_stats["filtered_similar"] += 1
398 | logger.info(f"已过滤 {reason}: {url}")
399 | else:
400 | cat_stats["failed"] += 1
401 | logger.info(f"下载失败 {reason}: {url}")
402 |
403 | for key in total_stats:
404 | total_stats[key] += cat_stats[key]
405 |
406 | logger.info(
407 | f"分类 {category_name} 处理完成: "
408 | f"成功 {cat_stats['success']} 张, "
409 | f"失败 {cat_stats['failed']} 张, "
410 | f"过滤黑白 {cat_stats['filtered_black_white']} 张, "
411 | f"过滤纯色背景 {cat_stats['filtered_solid_bg']} 张, "
412 | f"过滤偏暗图片 {cat_stats['filtered_dark']} 张, "
413 | f"过滤相似图片 {cat_stats['filtered_similar']} 张\n"
414 | )
415 |
416 | logger.info(
417 | f"====== 所有分类处理完毕 ======\n"
418 | f"总计: {total_stats['total']} 张\n"
419 | f"成功下载: {total_stats['success']} 张\n"
420 | f"下载失败: {total_stats['failed']} 张\n"
421 | f"过滤黑白图片: {total_stats['filtered_black_white']} 张\n"
422 | f"过滤纯色背景图片: {total_stats['filtered_solid_bg']} 张\n"
423 | f"过滤偏暗图片: {total_stats['filtered_dark']} 张\n"
424 | f"过滤相似图片: {total_stats['filtered_similar']} 张"
425 | )
426 |
427 |
428 | def main():
429 | """主函数"""
430 | logger.info("====== 360壁纸批量下载脚本启动 ======")
431 | logger.info(f"配置信息: 并发数={MAX_WORKERS}, 每页数量={PAGE_SIZE}")
432 | logger.info(f"下载根目录: {os.path.abspath(DOWNLOAD_ROOT_DIR)}")
433 | logger.info(f"图片过滤: 黑白图片阈值={BLACK_WHITE_THRESHOLD}, "
434 | f"纯色背景阈值={SOLID_BACKGROUND_THRESHOLD}, "
435 | f"亮度阈值={BRIGHTNESS_THRESHOLD}, "
436 | f"相似图片阈值={SIMILARITY_THRESHOLD}")
437 |
438 | categorized_images = collect_all_image_urls()
439 | if categorized_images:
440 | download_categorized_images(categorized_images)
441 | else:
442 | logger.info("没有收集到任何图片URL,程序退出")
443 |
444 |
445 | if __name__ == "__main__":
446 | main()
447 |
--------------------------------------------------------------------------------
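One detail worth calling out in the filters above: PIL returns uint8 arrays, and unsigned subtraction in NumPy wraps around instead of going negative, which is why `is_black_white` casts to a signed type before taking the channel differences. A tiny standalone demonstration (not part of the repo):

```python
import numpy as np

r = np.array([10, 200], dtype=np.uint8)
g = np.array([20, 100], dtype=np.uint8)

# uint8 subtraction wraps around: 10 - 20 becomes 246 instead of -10.
print(np.abs(r - g))                                    # [246 100]

# Casting to a signed type first gives the intended per-pixel difference.
print(np.abs(r.astype(np.int16) - g.astype(np.int16)))  # [ 10 100]
```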
/wallpaper-dl/wallhaven.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # coding=utf8
3 | # @Author: Kinoko
4 | # @Date : 2025/08/08
5 | # @Desc : Wallhaven 壁纸批量下载脚本 - 支持过滤黑白、纯色背景、偏暗和相似图片
6 | import logging
7 | import os
8 | import re
9 | import time
10 | from concurrent.futures import ThreadPoolExecutor, as_completed
11 | from io import BytesIO
12 |
13 | import numpy as np
14 | import requests
15 | from PIL import Image
16 | from sklearn.cluster import KMeans
17 | from tqdm import tqdm
18 |
19 | # ===================== 配置项 =====================
20 | # API基础地址
21 | API_BASE_URL = "https://api.codelife.cc/wallpaper/wallhaven"
22 |
23 | # 分类映射关系 (id: 分类名称)
24 | CATEGORY_MAPPING = {
25 | # "1": "二次元",
26 | # "5": "二次元",
27 | "37": "自然风景",
28 | "711": "自然风景",
29 | "1748": "吉卜力",
30 | "2321": "像素风"
31 | }
32 |
33 | # 自定义下载根目录
34 | DOWNLOAD_ROOT_DIR = "D:/DL"
35 |
36 | # 并发下载线程数
37 | MAX_WORKERS = 5
38 |
39 | # 请求超时时间(秒)
40 | TIMEOUT = 10
41 |
42 | # 请求头
43 | HEADERS = {
44 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
45 | "Accept": "application/json, text/plain, */*",
46 | "Connection": "keep-alive"
47 | }
48 |
49 | # 下载失败重试次数
50 | MAX_RETRIES = 10
51 |
52 | # 重试延迟时间(秒)
53 | RETRY_DELAY = 2
54 |
55 | # 图片过滤配置
56 | BLACK_WHITE_THRESHOLD = 10 # 黑白判断阈值
57 | SOLID_BACKGROUND_THRESHOLD = 0.7 # 纯色背景判断阈值
58 | SOLID_COLOR_TOLERANCE = 15 # 颜色容差
59 | BRIGHTNESS_THRESHOLD = 20 # 亮度阈值(0-255)
60 | SIMILARITY_THRESHOLD = 5 # 相似图片判断阈值(汉明距离),值越小要求越相似
61 |
62 | # 域名配置 - 主域名和备用域名列表
63 | PRIMARY_DOMAIN = "https://w.wallhaven.cc/"
64 | BACKUP_DOMAINS = [
65 | "https://w.wallhaven.wpcoder.cn/",
66 | "https://w.wallhaven.clbug.com/",
67 | "https://w.wallhaven.1lou.top/",
68 | "https://files.codelife.cc/wallhaven/"
69 | ]
70 |
71 | # 日志配置
72 | logging.basicConfig(
73 | level=logging.INFO,
74 | format='%(asctime)s - %(levelname)s - %(message)s',
75 | datefmt='%Y-%m-%d %H:%M:%S'
76 | )
77 | logger = logging.getLogger(__name__)
78 |
79 | # 存储已下载图片的哈希值,用于相似性检查
80 | image_hashes = {} # 结构: {category_name: [hash_values]}
81 |
82 |
83 | # =================================================
84 |
85 |
86 | def get_domain_url(raw_url, domain):
87 | """使用指定域名生成URL,只提取full/.../wallhaven-....[图片格式]部分"""
88 | if not raw_url:
89 | return ""
90 |
91 | # 修正正则表达式:精准匹配从full/开始到图片扩展名结束的路径
92 | path_match = re.search(r'(full/[^?]+\.(?:jpg|jpeg|png|gif|webp))', raw_url)
93 | if path_match:
94 | path = path_match.group(1)
95 | if not domain.endswith('/'):
96 | domain += '/'
97 | return f"{domain}{path}"
98 |
99 | logger.warning(f"无法提取有效路径: {raw_url}")
100 | return raw_url.split('?')[0]
101 |
102 |
103 | def clean_url(raw_url):
104 | """默认URL清理:使用主域名"""
105 | return get_domain_url(raw_url, PRIMARY_DOMAIN)
106 |
107 |
108 | def calculate_perceptual_hash(image, hash_size=16):
109 | """计算图片的感知哈希值"""
110 | try:
111 | # 缩小图片尺寸并转为灰度图
112 | img = image.resize((hash_size, hash_size), Image.LANCZOS).convert('L')
113 | img_array = np.array(img)
114 |
115 | # 计算平均亮度
116 | avg_brightness = img_array.mean()
117 |
118 | # 生成哈希值:像素亮度高于平均为1,否则为0
119 | hash_array = (img_array > avg_brightness).flatten()
120 |
121 | # 转换为整数哈希值
122 | hash_value = 0
123 | for bit in hash_array:
124 | hash_value = (hash_value << 1) | (1 if bit else 0)
125 |
126 | return hash_value
127 | except Exception as e:
128 | logger.error(f"计算哈希值失败: {str(e)}")
129 | return None
130 |
131 |
132 | def hamming_distance(hash1, hash2):
133 | """计算两个哈希值的汉明距离"""
134 | if hash1 is None or hash2 is None:
135 | return float('inf') # 无法计算时视为差异极大
136 | # 计算两个哈希值的异或结果中1的个数
137 | return bin(hash1 ^ hash2).count('1')
138 |
139 |
140 | def is_similar_to_existing(image, category_name):
141 | """判断图片是否与同分类中已下载的图片相似"""
142 | if category_name not in image_hashes:
143 | return False, None
144 |
145 | current_hash = calculate_perceptual_hash(image)
146 | if current_hash is None:
147 | return False, None
148 |
149 | # 与同分类中所有已下载图片比较
150 | for existing_hash in image_hashes[category_name]:
151 | distance = hamming_distance(current_hash, existing_hash)
152 | if distance < SIMILARITY_THRESHOLD:
153 | return True, distance
154 |
155 | return False, None
156 |
157 |
158 | def is_black_white(image):
159 | """判断图片是否为黑白"""
160 | try:
161 | img_rgb = image.convert('RGB')
162 | img_array = np.array(img_rgb).astype(np.int16)  # use a signed dtype so the channel differences below don't wrap around
163 |
164 | r, g, b = img_array[:, :, 0], img_array[:, :, 1], img_array[:, :, 2]
165 | diff1 = np.abs(r - g)
166 | diff2 = np.abs(r - b)
167 | diff3 = np.abs(g - b)
168 |
169 | total_pixels = img_array.shape[0] * img_array.shape[1]
170 | bw_pixels = np.sum((diff1 < BLACK_WHITE_THRESHOLD) &
171 | (diff2 < BLACK_WHITE_THRESHOLD) &
172 | (diff3 < BLACK_WHITE_THRESHOLD))
173 |
174 | return bw_pixels / total_pixels > 0.95
175 | except Exception as e:
176 | logger.error(f"黑白判断失败: {str(e)}")
177 | return False
178 |
179 |
180 | def has_solid_background(image):
181 | """判断图片是否有纯色背景"""
182 | try:
183 | img_rgb = image.convert('RGB')
184 | img_array = np.array(img_rgb)
185 | pixels = img_array.reshape(-1, 3)
186 |
187 | kmeans = KMeans(n_clusters=min(10, len(pixels)), random_state=42)
188 | kmeans.fit(pixels)
189 |
190 | cluster_counts = np.bincount(kmeans.labels_)
191 | max_cluster_ratio = np.max(cluster_counts) / len(pixels)
192 |
193 | return max_cluster_ratio > SOLID_BACKGROUND_THRESHOLD
194 | except Exception as e:
195 | logger.error(f"纯色背景判断失败: {str(e)}")
196 | return False
197 |
198 |
199 | def is_too_dark(image):
200 | """判断图片是否偏暗"""
201 | try:
202 | img_gray = image.convert('L')
203 | img_array = np.array(img_gray)
204 | average_brightness = np.mean(img_array)
205 | return average_brightness < BRIGHTNESS_THRESHOLD
206 | except Exception as e:
207 | logger.error(f"亮度判断失败: {str(e)}")
208 | return False
209 |
210 |
211 | def download_and_filter_image(url, save_path, category_name):
212 | """下载图片并进行过滤(支持多域名切换重试)"""
213 | all_domains = [PRIMARY_DOMAIN] + BACKUP_DOMAINS
214 | current_domain_index = 0
215 |
216 | for attempt in range(MAX_RETRIES):
217 | try:
218 | current_domain = all_domains[current_domain_index]
219 | current_url = get_domain_url(url, current_domain)
220 |
221 | logger.debug(f"尝试下载 {current_url} (第 {attempt + 1} 次,使用域名: {current_domain})")
222 | response = requests.get(
223 | current_url,
224 | headers=HEADERS,
225 | timeout=TIMEOUT,
226 | stream=True
227 | )
228 | response.raise_for_status()
229 |
230 | image_data = BytesIO(response.content)
231 |
232 | try:
233 | with Image.open(image_data) as img:
234 | # 检查是否为黑白图片
235 | if is_black_white(img):
236 | logger.debug(f"过滤黑白图片: {current_url}")
237 | return False, "黑白图片"
238 |
239 | # 检查是否为纯色背景图片
240 | if has_solid_background(img):
241 | logger.debug(f"过滤纯色背景图片: {current_url}")
242 | return False, "纯色背景图片"
243 |
244 | # 检查是否为偏暗图片
245 | if is_too_dark(img):
246 | logger.debug(f"过滤偏暗图片: {current_url}")
247 | return False, "偏暗图片"
248 |
249 | # 检查是否与已下载图片相似
250 | is_similar, distance = is_similar_to_existing(img, category_name)
251 | if is_similar:
252 | logger.debug(f"过滤相似图片 (距离: {distance}): {current_url}")
253 | return False, f"相似图片 (距离: {distance})"
254 |
255 | except Exception as e:
256 | logger.warning(f"图片分析失败 {current_url} (格式可能异常): {str(e)}")
257 | return False, "图片格式异常"
258 |
259 | # 保存图片
260 | os.makedirs(os.path.dirname(save_path), exist_ok=True)
261 | with open(save_path, 'wb') as f:
262 | f.write(response.content)
263 |
264 | # 计算并保存哈希值
265 | with Image.open(save_path) as saved_img:
266 | img_hash = calculate_perceptual_hash(saved_img)
267 | if img_hash is not None:
268 | if category_name not in image_hashes:
269 | image_hashes[category_name] = []
270 | image_hashes[category_name].append(img_hash)
271 |
272 | logger.debug(f"成功下载: {save_path} (来源: {current_url})")
273 | return True, "成功"
274 |
275 | except Exception as e:
276 | current_domain_index = (current_domain_index + 1) % len(all_domains)
277 |
278 | if attempt < MAX_RETRIES - 1:
279 | next_domain = all_domains[current_domain_index]
280 | logger.warning(
281 | f"下载失败 {current_url} (第 {attempt + 1} 次): {str(e)},"
282 | f"将尝试域名 {next_domain} 重试..."
283 | )
284 | time.sleep(RETRY_DELAY * (attempt + 1))
285 | continue
286 |
287 | logger.error(f"下载失败 {current_url} (已达最大重试次数): {str(e)}")
288 | return False, f"下载失败: {str(e)}"
289 | return False, "重试次数耗尽"  # unreachable in practice, but keeps the (success, reason) return contract
290 |
291 |
292 | def fetch_page_images(category_id, page_num):
293 | """获取指定分类和页码的图片列表"""
294 | try:
295 | url = f"{API_BASE_URL}?lang=cn&page={page_num}&size=50&q=id:{category_id}"
296 | logger.debug(f"请求URL: {url}")
297 | response = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
298 | response.raise_for_status()
299 | return response.json()
300 | except Exception as e:
301 | logger.error(f"获取第 {page_num} 页数据失败: {str(e)}")
302 | return None
303 |
304 |
305 | def collect_all_image_urls():
306 | """收集所有分类的图片URL,进行全局去重"""
307 | logger.info("====== 开始收集所有分类的图片URL ======")
308 |
309 | all_images = {}
310 | total_count = 0
311 |
312 | for category_id, category_name in CATEGORY_MAPPING.items():
313 | logger.info(f"开始收集分类: {category_name} (ID: {category_id}) 的图片URL")
314 | save_dir = os.path.join(DOWNLOAD_ROOT_DIR, category_name)
315 |
316 | try:
317 | first_page_data = fetch_page_images(category_id, 1)
318 | if not first_page_data or first_page_data.get("code") != 200:
319 | error_msg = first_page_data.get("msg", "未知错误") if first_page_data else "无法获取数据"
320 | logger.error(f"API请求失败: {error_msg}")
321 | continue
322 |
323 | total_pages = first_page_data.get("pages", 0)
324 | cat_count = first_page_data.get("count", 0)
325 | total_count += cat_count
326 |
327 | if total_pages == 0:
328 | logger.info(f"分类 {category_name} 没有找到壁纸")
329 | continue
330 |
331 | logger.info(f"分类 {category_name} 发现 {cat_count} 张壁纸,共 {total_pages} 页")
332 |
333 | for page in range(1, total_pages + 1):
334 | page_data = fetch_page_images(category_id, page)
335 | if not page_data or page_data.get("code") != 200:
336 | error_msg = page_data.get("msg", "未知错误") if page_data else "无法获取数据"
337 | logger.warning(f"获取第 {page} 页失败: {error_msg},将跳过该页")
338 | continue
339 |
340 | for item in page_data.get("data", []):
341 | raw_url = item.get("raw", "")
342 | clean_img_url = clean_url(raw_url)
343 | if not clean_img_url:
344 | continue
345 |
346 | image_name = f"{item.get('name', '')}.jpg"
347 | image_name = re.sub(r'[\\/*?:"<>|]', "", image_name)
348 | save_path = os.path.join(save_dir, image_name)
349 |
350 | if clean_img_url not in all_images:
351 | all_images[clean_img_url] = (category_name, save_path)
352 |
353 | logger.info(f"已收集分类 {category_name} 第 {page}/{total_pages} 页的图片链接")
354 | time.sleep(1)
355 |
356 | except Exception as e:
357 | logger.error(f"收集分类 {category_name} URL时出错: {str(e)}", exc_info=True)
358 |
359 | duplicate_count = total_count - len(all_images)
360 | logger.info(
361 | f"URL收集完成,原始总计 {total_count} 张,去重后剩余 {len(all_images)} 张,移除了 {duplicate_count} 个重复链接")
362 |
363 | categorized_images = {}
364 | for url, (category_name, save_path) in all_images.items():
365 | if category_name not in categorized_images:
366 | categorized_images[category_name] = []
367 | categorized_images[category_name].append((url, save_path))
368 |
369 | return categorized_images
370 |
371 |
372 | def download_categorized_images(categorized_images):
373 | """按分类下载整理好的图片"""
374 | logger.info("====== 开始按分类下载图片 ======")
375 | logger.info(f"使用的域名列表: 主域名={PRIMARY_DOMAIN}, 备用域名={BACKUP_DOMAINS}")
376 | os.makedirs(DOWNLOAD_ROOT_DIR, exist_ok=True)
377 |
378 | # 初始化哈希存储
379 | global image_hashes
380 | image_hashes = {category: [] for category in categorized_images.keys()}
381 |
382 | total_stats = {"total": 0, "success": 0, "failed": 0,
383 | "filtered_black_white": 0, "filtered_solid_bg": 0,
384 | "filtered_dark": 0, "filtered_similar": 0}
385 |
386 | for category_name, image_list in categorized_images.items():
387 | logger.info(f"开始处理分类: {category_name},共 {len(image_list)} 张图片")
388 | cat_stats = {
389 | "total": len(image_list),
390 | "success": 0,
391 | "failed": 0,
392 | "filtered_black_white": 0,
393 | "filtered_solid_bg": 0,
394 | "filtered_dark": 0,
395 | "filtered_similar": 0
396 | }
397 |
398 | with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
399 | # 提交任务时传递分类名称
400 | futures = {
401 | executor.submit(download_and_filter_image, url, path, category_name): (url, path)
402 | for url, path in image_list
403 | }
404 |
405 | for future in tqdm(as_completed(futures), total=len(futures), desc=f"下载 {category_name}"):
406 | url, path = futures[future]
407 | cleaned_url = clean_url(url)
408 | result, reason = future.result()
409 | if result:
410 | cat_stats["success"] += 1
411 | else:
412 | if reason == "黑白图片":
413 | cat_stats["filtered_black_white"] += 1
414 | logger.info(f"已过滤 {reason}: {cleaned_url}")
415 | elif reason == "纯色背景图片":
416 | cat_stats["filtered_solid_bg"] += 1
417 | logger.info(f"已过滤 {reason}: {cleaned_url}")
418 | elif reason == "偏暗图片":
419 | cat_stats["filtered_dark"] += 1
420 | logger.info(f"已过滤 {reason}: {cleaned_url}")
421 | elif reason.startswith("相似图片"):
422 | cat_stats["filtered_similar"] += 1
423 | logger.info(f"已过滤 {reason}: {cleaned_url}")
424 | else:
425 | cat_stats["failed"] += 1
426 | logger.info(f"下载失败 {reason}: {cleaned_url}")
427 |
428 | for key in total_stats:
429 | total_stats[key] += cat_stats[key]
430 |
431 | logger.info(
432 | f"分类 {category_name} 处理完成: "
433 | f"成功 {cat_stats['success']} 张, "
434 | f"失败 {cat_stats['failed']} 张, "
435 | f"过滤黑白 {cat_stats['filtered_black_white']} 张, "
436 | f"过滤纯色背景 {cat_stats['filtered_solid_bg']} 张, "
437 | f"过滤偏暗图片 {cat_stats['filtered_dark']} 张, "
438 | f"过滤相似图片 {cat_stats['filtered_similar']} 张\n"
439 | )
440 |
441 | logger.info(
442 | f"====== 所有分类处理完毕 ======\n"
443 | f"总计: {total_stats['total']} 张\n"
444 | f"成功下载: {total_stats['success']} 张\n"
445 | f"下载失败: {total_stats['failed']} 张\n"
446 | f"过滤黑白图片: {total_stats['filtered_black_white']} 张\n"
447 | f"过滤纯色背景图片: {total_stats['filtered_solid_bg']} 张\n"
448 | f"过滤偏暗图片: {total_stats['filtered_dark']} 张\n"
449 | f"过滤相似图片: {total_stats['filtered_similar']} 张"
450 | )
451 |
452 |
453 | def main():
454 | """主函数"""
455 | logger.info("====== Wallhaven 壁纸批量下载脚本启动 ======")
456 | logger.info(f"配置信息: 并发数={MAX_WORKERS}, 每页数量=50")
457 | logger.info(f"下载根目录: {os.path.abspath(DOWNLOAD_ROOT_DIR)}")
458 | logger.info(f"图片过滤: 黑白图片阈值={BLACK_WHITE_THRESHOLD}, "
459 | f"纯色背景阈值={SOLID_BACKGROUND_THRESHOLD}, "
460 | f"亮度阈值={BRIGHTNESS_THRESHOLD}, "
461 | f"相似图片阈值={SIMILARITY_THRESHOLD}")
462 | logger.info(f"域名配置: 主域名={PRIMARY_DOMAIN}, 备用域名={BACKUP_DOMAINS}")
463 |
464 | categorized_images = collect_all_image_urls()
465 | if categorized_images:
466 | download_categorized_images(categorized_images)
467 | else:
468 | logger.info("没有收集到任何图片URL,程序退出")
469 |
470 |
471 | if __name__ == "__main__":
472 | main()
473 |
--------------------------------------------------------------------------------
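The main difference from 360.py is the mirror-domain fallback in `download_and_filter_image`: every failed attempt rotates to the next domain in the list before retrying. The same pattern in isolation, as a minimal sketch (the `fetch_with_fallback` helper is illustrative and not code from the repo; the mirror URLs are taken from the configuration above):

```python
import time

import requests

MIRRORS = [
    "https://w.wallhaven.cc/",
    "https://w.wallhaven.wpcoder.cn/",
    "https://files.codelife.cc/wallhaven/",
]

def fetch_with_fallback(path: str, retries: int = 5, delay: float = 2.0) -> bytes:
    """Try the mirrors in rotation, backing off a little longer after each failure."""
    mirror_index = 0
    last_error = None
    for attempt in range(retries):
        url = MIRRORS[mirror_index] + path
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response.content
        except requests.RequestException as exc:
            last_error = exc
            mirror_index = (mirror_index + 1) % len(MIRRORS)  # switch mirror for the next attempt
            time.sleep(delay * (attempt + 1))
    raise RuntimeError(f"all mirrors failed for {path}: {last_error}")

# Usage: paths look like the ones get_domain_url() extracts, e.g. "full/xx/wallhaven-xxxxxx.jpg".
```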
/mtab-import/website-info.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # coding=utf8
3 | # @Author: Kinoko
4 | # @Date : 2025/08/01
5 | # @Desc : mTab多分类网站书签导入工具(AI)
6 | # @Func : 批量获取网站信息,处理URL去重、AI生成标题、描述和分类,图标下载转换压缩SVG,生成JSON
7 |
8 |
9 | # ============================== 公共配置参数区 ==============================
10 | # 线程数量配置
11 | MAX_WORKERS = 40 # 并发处理URL的线程数量
12 |
13 | # AI模型配置
14 | AI_CONFIG = {
15 | "api_key_env_var": "AI_API_KEY",
16 | "base_url": "https://www.gptapi.us/v1",
17 | "model": "gpt-4o-mini",
18 | "temperature": 0.7,
19 | "max_tokens": 1024,
20 | "max_retries": 2, # AI调用失败最大重试次数
21 | "retry_delay": 1 # 重试延迟时间(秒)
22 | }
23 |
24 | # 分类配置
25 | CATEGORIES = [
26 | "ai", "app", "news", "music", "tech", "photos", "life", "education",
27 | "entertainment", "shopping", "social", "read", "sports", "finance", "others"
28 | ]
29 | CATEGORY_IDS = {
30 | "生活&出行&地图&交通&美食&健康&母婴": 1,
31 | "新闻&资讯&财经资讯&地方资讯": 2,
32 | "社交&互动&论坛&社区&邮箱&即时通讯": 3,
33 | "购物&消费&电商&跨境购&二手交易": 4,
34 | "影音&媒体&影视&音乐&短视频&直播": 5,
35 | "阅读&出版&书籍&小说&漫画&百科&文献": 6,
36 | "游戏&娱乐&电竞&手游&休闲游戏": 7,
37 | "应用&工具&办公工具&效率工具&系统工具&AI": 8,
38 | "教育&学习&课程&大学&职业教育&语言学习": 9,
39 | "设计&图片&素材&创意&模板&UI设计": 10,
40 | "开发&编程&Web&框架&编程语言&IDE&技术&文档&API文档&开源手册&教程文档": 11,
41 | "职场&就业&招聘&创业&职场技能": 12,
42 | "金融&投资&银行&理财&保险&支付": 13,
43 | "体育&运动&健身&赛事&运动装备": 14,
44 | "其他": 15
45 | }
46 |
47 | # 供AI参考的分类列表
48 | AI_CATEGORY_OPTIONS = list(CATEGORY_IDS.keys())
49 |
50 | # 域名过滤配置
51 | DOMAIN_BLACKLIST = {
52 | "trae.cn", "trae.ai", "js.design", "zenvideo.qq.com"
53 | }
54 | DOMAIN_WHITELIST = {
55 | "x.com", "qq.com", "gmail.com", "google.com", "github.com", "youtube.com", "facebook.com",
56 | "yandex.com", "www.iqiyi.com", "yiyan.baidu.com", "outlook.live.com"
57 | }
58 |
59 | # 域名映射配置 - 键为需要映射的域名或URL,值为目标域名或URL
60 | DOMAIN_MAPPING = {
61 | "https://tj.shshinfo.com/tz/pcw/kimi10.html": "https://www.kimi.com",
62 | }
63 |
64 | # 网络请求配置
65 | HTTP_CONFIG = {
66 | 'timeout': 20, # 请求超时时间(秒)
67 | 'headers': {
68 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
69 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
70 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
71 | 'Connection': 'keep-alive'
72 | }
73 | }
74 |
75 | # URL跳转配置
76 | REDIRECT_CONFIG = {
77 | 'max_redirects': 10, # 最大跳转次数,防止无限循环
78 | 'js_redirect_patterns': [
79 | r'window\.location\.href\s*=\s*["\'](.*?)["\']',
80 | r'window\.location\s*=\s*["\'](.*?)["\']',
81 | r'location\.href\s*=\s*["\'](.*?)["\']',
82 | r'location\s*=\s*["\'](.*?)["\']',
83 | r'redirect\s*\(\s*["\'](.*?)["\']\s*\)',
84 | r'window\.open\s*\(\s*["\'](.*?)["\']\s*\)'
85 | ]
86 | }
87 |
88 | # 文件路径配置
89 | ICON_DIRECTORY = 'icons'
90 | JSON_OUTPUT_FILE = "mtab_data.json"
91 |
92 | # 图片下载配置
93 | MAX_IMAGE_RETRIES = 3 # 最大重试次数
94 | INITIAL_RETRY_DELAY = 1 # 初始重试延迟(秒)
95 | # ========================================================================
96 |
97 |
98 | # 导入依赖库
99 | import io
100 | import json
101 | import logging
102 | import os
103 | import random
104 | import re
105 | import threading
106 | import time
107 | from base64 import b64encode
108 | from collections import Counter
109 | from concurrent.futures import ThreadPoolExecutor, as_completed
110 | from dataclasses import dataclass, asdict
111 | from typing import List, Dict, Set, Tuple, Optional
112 | from urllib.parse import quote, urlparse, urljoin
113 |
114 | import requests
115 | import validators
116 | from PIL import Image
117 | from openai import OpenAI
118 | from openai.types.chat import (
119 | ChatCompletionSystemMessageParam,
120 | ChatCompletionUserMessageParam,
121 | ChatCompletionMessageParam
122 | )
123 | from tldextract import extract
124 | from tqdm import tqdm
125 |
126 |
127 | # 数据结构定义
128 | @dataclass
129 | class WebsiteData:
130 | name: str # 网站名称(AI生成)
131 | url: str # 网站URL(小写处理后)
132 | description: str # 网站描述(AI生成)
133 | img_src: str # 图标原始URL
134 | local_filename: str # 本地存储的图标文件名
135 | category: str # 所属分类(中文)
136 | category_id: int # 分类ID
137 | background_color: str # 背景颜色
138 |
139 |
140 | # 初始化AI客户端 - 从环境变量获取API密钥
141 | def get_ai_client():
142 | """从环境变量获取API密钥并初始化AI客户端"""
143 | api_key = os.getenv(AI_CONFIG["api_key_env_var"])
144 | if not api_key:
145 | raise EnvironmentError(f"未设置环境变量 {AI_CONFIG['api_key_env_var']},请配置API密钥")
146 |
147 | return OpenAI(
148 | api_key=api_key,
149 | base_url=AI_CONFIG["base_url"]
150 | )
151 |
152 |
153 | # 初始化AI客户端
154 | ai_client = get_ai_client()
155 |
156 |
157 | # ============================== 日志配置 ==============================
158 | def setup_logger() -> logging.Logger:
159 | """配置并返回日志记录器"""
160 | logger = logging.getLogger('mtab_exporter')
161 | logger.setLevel(logging.INFO)
162 |
163 | ch = logging.StreamHandler()
164 | ch.setLevel(logging.INFO)
165 |
166 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
167 | ch.setFormatter(formatter)
168 |
169 | if logger.handlers:
170 | logger.handlers = []
171 | logger.addHandler(ch)
172 |
173 | return logger
174 |
175 |
176 | # 初始化日志
177 | logger = setup_logger()
178 |
179 |
180 | # ============================== URL处理工具函数 ==============================
181 | def apply_domain_mapping(url: str) -> str:
182 | """应用域名映射规则,将URL转换为目标URL"""
183 | url_lower = url.lower()
184 |
185 | # 检查完整URL匹配
186 | if url_lower in DOMAIN_MAPPING:
187 | mapped_url = DOMAIN_MAPPING[url_lower]
188 | logger.info(f"URL映射: {url_lower} -> {mapped_url}")
189 | return mapped_url
190 |
191 | # 检查域名级别匹配
192 | parsed = urlparse(url_lower)
193 | domain = parsed.netloc
194 |
195 | # 检查子域名+主域名匹配
196 | if domain in DOMAIN_MAPPING:
197 | mapped_domain = DOMAIN_MAPPING[domain]
198 | mapped_parsed = urlparse(mapped_domain)
199 | # 保留路径和参数,但使用新的域名和协议
200 | new_url = urljoin(mapped_domain, parsed.path)
201 | if parsed.query:
202 | new_url += f"?{parsed.query}"
203 | logger.info(f"域名映射: {domain} -> {mapped_domain}, 完整URL: {url_lower} -> {new_url}")
204 | return new_url
205 |
206 | # 检查主域名匹配
207 | ext = extract(domain)
208 | main_domain = f"{ext.domain}.{ext.suffix}"
209 | if main_domain in DOMAIN_MAPPING:
210 | mapped_domain = DOMAIN_MAPPING[main_domain]
211 | # 替换主域名但保留子域名
212 | subdomain = ext.subdomain
213 | new_netloc = f"{subdomain}.{mapped_domain}" if subdomain else mapped_domain
214 | new_url = f"{parsed.scheme}://{new_netloc}{parsed.path}"
215 | if parsed.query:
216 | new_url += f"?{parsed.query}"
217 | logger.info(f"主域名映射: {main_domain} -> {mapped_domain}, 完整URL: {url_lower} -> {new_url}")
218 | return new_url
219 |
220 | # 无匹配的映射规则
221 | return url_lower
222 |
223 |
224 | def normalize_url(url: str) -> str:
225 | """标准化URL格式并转换为小写"""
226 | parsed = urlparse(url)
227 | normalized = f"{parsed.scheme.lower()}://{parsed.netloc.lower()}{parsed.path}".rstrip('/')
228 | return normalized
229 |
230 |
231 | def extract_domain(url: str) -> str:
232 | """提取URL中的域名(小写)"""
233 | parsed = urlparse(url)
234 | return parsed.netloc.lower()
235 |
236 |
237 | def is_domain_whitelisted(url: str) -> bool:
238 | """检查域名是否在白名单中(使用小写域名检查)"""
239 | ext = extract(url)
240 | domain_parts = [part for part in [ext.subdomain, ext.domain, ext.suffix] if part]
241 | full_domain = ".".join(domain_parts).lower()
242 |
243 | if full_domain in DOMAIN_WHITELIST:
244 | return True
245 | if ext.registered_domain.lower() in DOMAIN_WHITELIST:
246 | return True
247 | for i in range(1, len(domain_parts)):
248 | if ".".join(domain_parts[i:]).lower() in DOMAIN_WHITELIST:
249 | return True
250 | return False
251 |
252 |
253 | def is_domain_blocked(url: str) -> bool:
254 | """检查域名是否在黑名单中(使用小写域名检查)"""
255 | domain = extract_domain(url).lower()
256 | parts = domain.split('.')
257 |
258 | if domain in DOMAIN_BLACKLIST:
259 | return True
260 | for i in range(len(parts) - 1):
261 | if '.'.join(parts[i:]).lower() in DOMAIN_BLACKLIST:
262 | return True
263 | return False
264 |
265 |
266 | def is_url_acceptable(url: str) -> Tuple[bool, str]:
267 | """检查URL是否符合处理条件(使用小写URL检查)"""
268 | lower_url = url.lower()
269 | if is_domain_blocked(lower_url):
270 | return False, f"URL在黑名单中: {extract_domain(lower_url)}"
271 | return True, "URL符合处理条件"
272 |
273 |
274 | def validate_and_process_url(url: str) -> Tuple[Optional[str], Optional[str]]:
275 | """验证并处理URL格式(确保返回小写URL)"""
276 | url = url.lower()
277 |
278 | if not url.startswith(('http://', 'https://')):
279 | return None, "URL缺少协议前缀"
280 |
281 | parsed = urlparse(url)
282 | base_url = f"{parsed.scheme}://{parsed.netloc}"
283 |
284 | # 强制使用HTTPS
285 | if base_url.startswith('http://'):
286 | base_url = base_url.replace('http://', 'https://')
287 |
288 | if not base_url.endswith('/'):
289 | base_url += '/'
290 |
291 | if not validators.url(base_url.rstrip('/')):
292 | return None, "URL格式无效"
293 |
294 | return base_url, None
295 |
296 |
297 | def get_preferred_url(original_url: str, redirect_history: List[str]) -> str:
298 | """根据跳转历史选择最优URL(返回小写URL)"""
299 | if not redirect_history:
300 | return original_url.lower()
301 |
302 | url_info = []
303 | for url in redirect_history:
304 | url_lower = url.lower()
305 | parsed = urlparse(url_lower)
306 | domain = parsed.netloc
307 | ext = extract(domain)
308 |
309 | url_info.append({
310 | "url": url_lower,
311 | "main_domain": ext.domain.lower(),
312 | "registered_domain": ext.registered_domain.lower(),
313 | "subdomain": ext.subdomain.lower(),
314 | "is_www": ext.subdomain.lower() == "www",
315 | "suffix": ext.suffix.lower(),
316 | "suffix_length": len(ext.suffix.split('.'))
317 | })
318 |
319 | base_main_domain = url_info[0]["main_domain"]
320 | same_main_domain_urls = [
321 | info for info in url_info
322 | if info["main_domain"] == base_main_domain
323 | ]
324 |
325 | if len(same_main_domain_urls) != len(url_info):
326 | return redirect_history[-1].lower()
327 |
328 | # 优先保留带www的URL
329 | www_urls = [info for info in same_main_domain_urls if info["is_www"]]
330 | if www_urls:
331 | www_urls_sorted = sorted(www_urls, key=lambda x: x["suffix_length"])
332 | return www_urls_sorted[0]["url"]
333 |
334 | # 无www时保留后缀最短的URL
335 | non_www_urls_sorted = sorted(same_main_domain_urls, key=lambda x: x["suffix_length"])
336 | shortest_suffix_urls = [
337 | info for info in non_www_urls_sorted
338 | if info["suffix_length"] == non_www_urls_sorted[0]["suffix_length"]
339 | ]
340 |
341 | # 后缀长度相同时保留最早出现的URL
342 | return shortest_suffix_urls[0]["url"]
343 |
344 |
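# 选择示意(假设的跳转历史,仅作说明):
#   redirect_history = ["https://example.com/", "https://www.example.com/"]
#   两条记录主域名相同且存在带 www 的地址,因此返回 "https://www.example.com/"
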
345 | def follow_redirects(url: str) -> Tuple[str, int, str, List[str]]:
346 | """跟踪URL跳转,包括HTTP重定向和JS跳转(返回小写URL)"""
347 | # 首先应用域名映射
348 | url = apply_domain_mapping(url)
349 |
350 | visited_urls = set()
351 | current_url = url.lower()
352 | redirect_history = [current_url]
353 | redirect_count = 0
354 |
355 | while redirect_count < REDIRECT_CONFIG['max_redirects']:
356 | if current_url in visited_urls:
357 |             return current_url, 302, f"检测到循环跳转,经过{redirect_count}次跳转", redirect_history
358 | visited_urls.add(current_url)
359 |
360 | try:
361 | response = requests.get(
362 | current_url,
363 | headers=HTTP_CONFIG['headers'],
364 | timeout=HTTP_CONFIG['timeout'],
365 | allow_redirects=False,
366 | stream=True
367 | )
368 |
369 | # 处理HTTP重定向
370 | if 300 <= response.status_code < 400 and 'Location' in response.headers:
371 | next_url = response.headers['Location'].lower()
372 | next_url = urljoin(current_url, next_url)
373 | # 对重定向的URL也应用映射规则
374 | next_url = apply_domain_mapping(next_url)
375 | logger.info(f"HTTP重定向: {current_url} -> {next_url}")
376 | current_url = next_url
377 | redirect_history.append(current_url)
378 | redirect_count += 1
379 | continue
380 |
381 | # 处理JS跳转
382 | if response.status_code == 200 and 'text/html' in response.headers.get('Content-Type', ''):
383 | content = response.raw.read(8192).decode('utf-8', errors='ignore')
384 |
385 | for pattern in REDIRECT_CONFIG['js_redirect_patterns']:
386 | match = re.search(pattern, content, re.IGNORECASE)
387 | if match:
388 | js_redirect_url = match.group(1).lower()
389 | js_redirect_url = urljoin(current_url, js_redirect_url)
390 | # 对JS跳转的URL也应用映射规则
391 | js_redirect_url = apply_domain_mapping(js_redirect_url)
392 | logger.info(f"JS跳转检测: {current_url} -> {js_redirect_url}")
393 | current_url = js_redirect_url
394 | redirect_history.append(current_url)
395 | redirect_count += 1
396 | response.close()
397 | break
398 | else:
399 | return current_url, response.status_code, f"最终URL,经过{redirect_count}次跳转", redirect_history
400 | continue
401 |
402 | return current_url, response.status_code, f"最终URL,经过{redirect_count}次跳转", redirect_history
403 |
404 | except requests.exceptions.SSLError:
405 | return current_url, 495, "HTTPS证书错误", redirect_history
406 | except Exception as e:
407 | return current_url, 500, f"请求错误: {str(e)}", redirect_history
408 |
409 | return current_url, 302, f"达到最大跳转次数 ({REDIRECT_CONFIG['max_redirects']})", redirect_history
410 |
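# 返回值示意(假设场景:https://example.com/ 以 301 跳转到 https://www.example.com/ 并最终返回 200):
#   ("https://www.example.com/", 200, "最终URL,经过1次跳转",
#    ["https://example.com/", "https://www.example.com/"])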
411 |
412 | def check_url_accessibility(url: str) -> Tuple[bool, Optional[str], Optional[str], Optional[str]]:
413 | """检查URL可访问性并处理跳转,返回最终小写URL"""
414 | try:
415 | url = url.lower()
416 | # 应用域名映射
417 | url = apply_domain_mapping(url)
418 |
419 | if url.startswith('http://'):
420 | url = url.replace('http://', 'https://')
421 |
422 | final_url, status_code, status_msg, redirect_history = follow_redirects(url)
423 | preferred_url = get_preferred_url(url, redirect_history)
424 | logger.info(f"URL跳转跟踪结果: {preferred_url} (状态码: {status_code}, {status_msg})")
425 |
426 | if status_code == 495 or status_code >= 500:
427 | return False, f"URL访问失败: {status_msg} (状态码: {status_code})", url, None
428 |
429 | is_acceptable, reason = is_url_acceptable(preferred_url)
430 | if not is_acceptable:
431 | return False, f"URL不符合处理条件: {reason}", url, None
432 |
433 | processed_url, error = validate_and_process_url(preferred_url)
434 | if not processed_url:
435 | return False, f"URL格式验证失败: {error}", url, None
436 |
437 | normalized = normalize_url(processed_url)
438 | return True, None, processed_url, normalized
439 |
440 | except Exception as e:
441 | normalized = normalize_url(url.lower())
442 | return False, f"URL处理异常: {str(e)[:20]}", url, normalized
443 |
444 |
445 | # ============================== 网站信息处理函数 ==============================
446 | def is_valid_text(text: str) -> bool:
447 | """检查文本是否有效(不是乱码),兼容中文、英文和俄文"""
448 | if not text or not text.strip():
449 | return False
450 |
451 | text_clean = re.sub(r'[\x00-\x1F\x7F]', '', text)
452 | if not text_clean:
453 | return False
454 |
455 | valid_chars = re.findall(
456 | r'[\u4e00-\u9fa5\u0400-\u04FFa-zA-Z0-9,。,.;:!?()()《》“”‘’«»\s]',
457 | text_clean
458 | )
459 |
460 | return len(valid_chars) / len(text_clean) > 0.5
461 |
462 |
463 | def clean_html_entities(text: str) -> str:
464 | """清理HTML实体编码,保留单引号转换"""
465 |     text = text.replace('&#039;', "'")
466 |     return re.sub(r'&#x?[0-9a-fA-F]+;', '', text)
467 |
468 |
469 | def fetch_api(api, url: str) -> Optional[Dict[str, str]]:
470 | """调用API获取网站标题和描述"""
471 | try:
472 | encoded_url = quote(url)
473 | api_url = api['url_template'].format(encoded_url)
474 |
475 | response = requests.get(
476 | api_url,
477 | headers=HTTP_CONFIG['headers'],
478 | timeout=HTTP_CONFIG['timeout'],
479 | allow_redirects=False
480 | )
481 | response.raise_for_status()
482 | return api['parse_func'](response.json())
483 |
484 | except Exception:
485 | return None
486 |
487 |
488 | def fetch_website_info(url: str) -> Optional[Dict[str, str]]:
489 | """通过多个API获取网站标题和描述"""
490 | if not url:
491 | return None
492 |
493 | invalid_values = {"null", "暂无标题", "暂无描述"}
494 |
495 | api_list = [
496 | {
497 | "name": "amogu",
498 | "url_template": "https://api.amogu.cn/api/tdk?url={}",
499 | "parse_func": lambda data: {
500 | "title": title,
501 | "description": desc
502 | } if data.get('code') == 1
503 | and (data_dict := data.get('data', {}))
504 | and (desc := data_dict.get('description', ''))
505 | and (title := data_dict.get('title', ''))
506 | and (title.strip() and title not in invalid_values or
507 | desc.strip() and desc not in invalid_values)
508 | and (not desc.strip() or is_valid_text(desc))
509 | else None
510 | },
511 | {
512 | "name": "shanhe",
513 | "url_template": "https://shanhe.kim/api/wz/web_tdk.php?url={}",
514 | "parse_func": lambda data: {
515 | "title": title,
516 | "description": desc
517 | } if (desc := data.get('description', ''))
518 | and (title := data.get('title', ''))
519 | and data.get('code') == 1
520 | and (title.strip() and title not in invalid_values or
521 | desc.strip() and desc not in invalid_values)
522 | and (not desc.strip() or is_valid_text(desc))
523 | else None
524 | },
525 | {
526 | "name": "suol",
527 | "url_template": "https://api.suol.cc/v1/zs_wzxx.php?url={}",
528 | "parse_func": lambda data: {
529 | "title": title,
530 | "description": desc
531 | } if (desc := data.get('description', ''))
532 | and (title := data.get('title', ''))
533 | and data.get('code') == 1
534 | and (title.strip() and title not in invalid_values or
535 | desc.strip() and desc not in invalid_values)
536 | and (not desc.strip() or is_valid_text(desc))
537 | else None
538 | },
539 | {
540 | "name": "ahfi",
541 | "url_template": "https://api.ahfi.cn/api/websiteinfo?url={}",
542 | "parse_func": lambda data: {
543 | "title": title,
544 | "description": desc
545 | } if (desc := data.get('data', {}).get('description', ''))
546 | and (title := data.get('data', {}).get('title', ''))
547 | and (title.strip() and title not in invalid_values or
548 | desc.strip() and desc not in invalid_values)
549 | and (not desc.strip() or is_valid_text(desc))
550 | else None
551 | }
552 | ]
553 |
554 | for api in api_list:
555 | try:
556 | if website_info := fetch_api(api, url):
557 | cleaned_title = website_info["title"].strip().replace('\n', ' ').replace('\r', ' ')
558 | cleaned_desc = website_info["description"].strip().replace('\n', ' ').replace('\r', ' ')
559 | return {
560 | "title": cleaned_title,
561 | "description": cleaned_desc
562 | }
563 | except Exception as e:
564 | logger.warning(f"{api['name']} API失败: {str(e)}")
565 | continue
566 |
567 | return None
568 |
569 |
570 | def ask_openai(question: str) -> Optional[Dict[str, str]]:
571 | """调用AI接口生成标题、描述和分类(带重试机制)"""
572 | system_msg: ChatCompletionSystemMessageParam = {
573 | "role": "system",
574 | "content": "我会给你一个网址、网站标题和网站描述,帮我生成网站收藏的标题、中文描述和分类。"
575 | "1. 标题要求简短最好一个词,优先从我给你的标题中取,不要翻译;"
576 | "2. 描述长度控制在120字符内,尽量精简,不要有多余空格,末尾不要带标点;"
577 | "3. 分类必须从以下选项中选择一个:" + str(AI_CATEGORY_OPTIONS) + ",如果未找到则返回 其他;"
578 | "返回给我包含三个字段 title、description、category 的JSON格式。"
579 | }
580 |
581 | user_msg: ChatCompletionUserMessageParam = {
582 | "role": "user",
583 | "content": question
584 | }
585 |
586 | messages: List[ChatCompletionMessageParam] = [system_msg, user_msg]
587 |
588 | for attempt in range(1, AI_CONFIG["max_retries"] + 1):
589 | try:
590 | response = ai_client.chat.completions.create(
591 | model=AI_CONFIG["model"],
592 | messages=messages,
593 | temperature=AI_CONFIG["temperature"],
594 | max_tokens=AI_CONFIG["max_tokens"]
595 | )
596 |
597 | result = response.choices[0].message.content.strip()
598 | if result == "不知道":
599 | return None
600 |
601 | result = re.sub(r'^```json\s*', '', result)
602 | result = re.sub(r'\s*```$', '', result)
603 |
604 | json_result = json.loads(result)
605 | if "title" in json_result and "description" in json_result and "category" in json_result:
606 | return {
607 | "title": json_result["title"].strip(),
608 | "description": json_result["description"].strip(),
609 | "category": json_result["category"].strip()
610 | }
611 | return None
612 |
613 | except json.JSONDecodeError:
614 | logger.warning(f"AI返回的不是有效的JSON: {result}")
615 | if attempt < AI_CONFIG["max_retries"]:
616 | time.sleep(AI_CONFIG["retry_delay"] * attempt)
617 | continue
618 | return None
619 | except Exception as e:
620 | logger.warning(f"AI调用失败 (尝试 {attempt}/{AI_CONFIG['max_retries']}): {str(e)}")
621 | if attempt < AI_CONFIG["max_retries"]:
622 | time.sleep(AI_CONFIG["retry_delay"] * attempt)
623 |
624 | logger.error(f"AI调用超过最大重试次数 ({AI_CONFIG['max_retries']}次),放弃请求")
625 | return None
626 |
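# 返回内容示意(假设的返回示例,category 取自 AI_CATEGORY_OPTIONS):
#   ask_openai("网址:example.com\n网站标题:Example") 成功时返回形如
#   {"title": "Example", "description": "示例网站", "category": "其他"} 的字典,失败时返回 None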
627 |
628 | def clean_website_info(url: str, original_title: str = "", original_desc: str = "") -> Optional[Dict[str, str]]:
629 | """清理并优化网站标题、描述和分类(结合API和AI)"""
630 | invalid_values = {"null", "暂无标题", "暂无描述"}
631 |
632 | cleaned_original_title = clean_html_entities(original_title).strip() if original_title else ""
633 | cleaned_original_desc = clean_html_entities(original_desc).strip() if original_desc else ""
634 |
635 | if cleaned_original_title in invalid_values:
636 | cleaned_original_title = ""
637 | if cleaned_original_desc in invalid_values:
638 | cleaned_original_desc = ""
639 |
640 | # 尝试通过API获取信息
641 | api_info = fetch_website_info(url)
642 | domain = extract_domain(url)
643 |
644 | # 处理API获取到的信息
645 | if api_info:
646 | api_title = api_info["title"] if api_info["title"] not in invalid_values else ""
647 | api_desc = api_info["description"] if api_info["description"] not in invalid_values else ""
648 |
649 | if api_title or api_desc:
650 | if ai_info := ask_openai(f"网址:{domain}\n网站标题:{api_title}\n网站描述:{api_desc}"):
651 | if re.search(r'[\u0400-\u04FF]', ai_info["title"]):
652 | logger.warning(f"AI生成俄文标题,丢弃URL: {url}")
653 | return None
654 | # 验证分类是否有效
655 | if ai_info["category"] not in AI_CATEGORY_OPTIONS:
656 | logger.warning(f"AI返回无效分类 {ai_info['category']},使用默认分类")
657 | ai_info["category"] = "其他"
658 | return ai_info
659 | logger.warning(f"API获取到信息但都无效,丢弃URL: {url}")
660 | return None
661 |
662 | # 白名单域名直接调用AI
663 | if is_domain_whitelisted(url):
664 | prompt = f"网址:{domain}"
665 | if cleaned_original_title:
666 | prompt += f"\n网站标题:{cleaned_original_title}"
667 | if cleaned_original_desc:
668 | prompt += f"\n网站描述:{cleaned_original_desc}"
669 |
670 | if ai_info := ask_openai(prompt):
671 | if re.search(r'[\u0400-\u04FF]', ai_info["title"]):
672 | logger.warning(f"AI生成俄文标题,丢弃URL: {url}")
673 | return None
674 | if ai_info["category"] not in AI_CATEGORY_OPTIONS:
675 | logger.warning(f"AI返回无效分类 {ai_info['category']},使用默认分类")
676 | ai_info["category"] = "其他"
677 | return ai_info
678 | logger.warning(f"白名单域名但AI调用失败,丢弃URL: {url}")
679 | return None
680 |
681 | # 只要有原始标题或描述,就尝试使用AI处理
682 | if cleaned_original_title or cleaned_original_desc:
683 | prompt = f"网址:{domain}"
684 | if cleaned_original_title:
685 | prompt += f"\n网站标题:{cleaned_original_title}"
686 | if cleaned_original_desc:
687 | prompt += f"\n网站描述:{cleaned_original_desc}"
688 |
689 | if ai_info := ask_openai(prompt):
690 | if re.search(r'[\u0400-\u04FF]', ai_info["title"]):
691 | logger.warning(f"AI生成俄文标题,丢弃URL: {url}")
692 | return None
693 | if ai_info["category"] not in AI_CATEGORY_OPTIONS:
694 | logger.warning(f"AI返回无效分类 {ai_info['category']},使用默认分类")
695 | ai_info["category"] = "其他"
696 | return ai_info
697 | logger.warning(f"有原始信息但AI调用失败,丢弃URL: {url}")
698 | return None
699 |
700 | return None
701 |
702 |
703 | # ============================== 图像处理函数 ==============================
704 | def compress_svg(svg_content: str) -> str:
705 | """压缩SVG内容,移除注释和多余空格"""
706 | try:
707 |         svg_content = re.sub(r'<!--.*?-->', '', svg_content, flags=re.DOTALL)
708 | lines = []
709 | for line in svg_content.split('\n'):
710 | line = line.strip()
711 | if line:
712 | lines.append(' '.join(line.split()))
713 | return ''.join(lines)
714 | except Exception as e:
715 | logger.error(f"SVG压缩失败: {e}")
716 | return svg_content
717 |
718 |
719 | def image_to_svg(img_response: requests.Response) -> str:
720 | """将图片转换为SVG格式"""
721 | try:
722 | img = Image.open(io.BytesIO(img_response.content))
723 | img_base64 = b64encode(img_response.content).decode('utf-8')
724 |
725 | content_type = img_response.headers.get('Content-Type', 'png')
726 | img_format = content_type.split('/')[-1].lower()
727 | if img_format not in ['png', 'jpeg', 'jpg', 'gif']:
728 | img_format = 'png'
729 |
730 |         svg_template = """<svg xmlns="http://www.w3.org/2000/svg" width="{}" height="{}">
731 |             <image href="data:image/{};base64,{}" width="{}" height="{}"/>
732 |         </svg>"""
733 |
734 | return compress_svg(svg_template.format(
735 | img.width, img.height, img_format, img_base64, img.width, img.height
736 | ))
737 | except Exception as e:
738 | logger.error(f"图片转换失败: {e}")
739 | raise ValueError(f"图片转换失败: {e}")
740 |
741 |
742 | def validate_svg(svg_content: str) -> bool:
743 | """验证SVG内容有效性"""
744 |     return all(tag in svg_content for tag in ['<svg', '</svg>'])