├── proxies.txt
├── whitelist.json
├── new
│   └── README.md
├── .github
│   └── workflows
│       ├── main.yml
│       └── update_status.yml
├── verify-url.py
└── README.md
/proxies.txt:
--------------------------------------------------------------------------------
1 | # Add proxies to be used when testing links
2 |
--------------------------------------------------------------------------------
/whitelist.json:
--------------------------------------------------------------------------------
1 | [
2 | "\\.edu(\\.[a-zA-Z0-9]+)*$",
3 | "hfbz\\.com",
4 | "sz\\.gov\\.cn"
5 | ]
--------------------------------------------------------------------------------
/new/README.md:
--------------------------------------------------------------------------------
1 | # 收录申请目录
2 |
3 | 在这个目录里创建以`.add.md`结尾的文件,内容格式:
4 | ```
5 | | [域名](完整链接) | 学校名称 |
6 | ```
7 | 每行一个,**格式不符合的行会自动忽略**。
8 | 下一次刷新状态时,所有符合以上格式的行会被添加到列表,同时**文件将会被删除**
9 |
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
# Reusable workflow: copies this repo's README.md into the
# cokice/docusaurus-yuanshen website repo (docs/README.md) and pushes it.
name: Sync README to docusaurus-yuanshen Repo

on:
  # Called by update_status.yml after a successful status refresh.
  workflow_call:
    secrets:
      API_TOKEN_GITHUB:
        required: true
  # Also runnable manually from the Actions tab.
  workflow_dispatch:

jobs:
  sync:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout source repo
        uses: actions/checkout@v4
        with:
          ref: 'main'

      - name: Push README to docusaurus-yuanshen repo
        env:
          API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }} # repository secret with push access to the website repo
        # Guard: only run for the canonical repo's main branch, never for forks.
        if: ${{ github.repository == 'cokice/List-of-genshin-University' && github.ref == 'refs/heads/main' }}
        run: |
          git config --global user.name 'howen'
          git config --global user.email 'howen.huang@qq.com'

          # Clone the destination repo
          git clone https://x-access-token:$API_TOKEN_GITHUB@github.com/cokice/docusaurus-yuanshen.git

          cd docusaurus-yuanshen
          git checkout main # switch to the main branch
          git pull origin main # pull the latest changes
          cp ../README.md docs/README.md # Copy the updated README
          git add docs/README.md
          git commit --allow-empty -m "Update README from List-of-genshin-University"
          git push origin main
--------------------------------------------------------------------------------
/.github/workflows/update_status.yml:
--------------------------------------------------------------------------------
# Periodically (and on relevant pushes) re-verifies every listed URL via
# verify-url.py, commits the refreshed README, and triggers the website sync.
name: Update Status
run-name: Update Status

on:
  workflow_dispatch:
    inputs:
      no_skip_edu:
        description: 'Do not skip edu domains'
        required: true
        type: 'boolean'
        default: false
      count_only:
        description: 'Only count the number of domains, do not verify'
        required: true
        type: 'boolean'
        default: false
  schedule:
    - cron: '30 3/4 * * *'
  push:
    branches:
      - main
    paths:
      - 'README.md'
      - '.github/workflows/update_status.yml'
      - 'verify-url.py'
      - 'new/**'
      - 'whitelist.json'

permissions:
  contents: write

# Only one status refresh at a time; a newer run supersedes an older one.
concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: true

jobs:
  update_status:
    name: Update Status
    runs-on: ubuntu-latest
    if: ${{ github.repository == 'cokice/List-of-genshin-University' }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          # Full history is needed so verify-url.py can read `git log` of new/ files.
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install dnspython requests pypinyin

      # Append the secret proxy list; reverted after the run so it is never committed.
      - name: Prepare Proxy
        run: |
          echo -e "${{ secrets.PROXY }}" >> proxies.txt

      - name: Update status
        run: python3 verify-url.py
        id: update_status
        env:
          # BUGFIX: use the typed `inputs` context. `github.event.inputs.*` is
          # always a string, and GitHub expression comparison of 'true' == true
          # never matches, so these env vars were always '0'.
          NO_SKIP_EDU: ${{ inputs.no_skip_edu == true && '1' || '0' }}
          COUNT_ONLY: ${{ inputs.count_only == true && '1' || '0' }}

      - name: Revert Proxy
        run: |
          git checkout proxies.txt

      - name: Commit and push
        uses: actions4git/add-commit-push@v1
        with:
          commit-message: ${{ steps.update_status.outputs.commit_message }}
        # Nothing to commit is not a failure.
        continue-on-error: true

  push_to_website:
    name: Push to website
    uses: ./.github/workflows/main.yml
    needs: update_status
    if: ${{ github.repository == 'cokice/List-of-genshin-University' }}
    secrets:
      API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
--------------------------------------------------------------------------------
/verify-url.py:
--------------------------------------------------------------------------------
1 | import concurrent.futures
2 | import datetime
3 | import dns.resolver
4 | import functools
5 | import idna
6 | import io
7 | import json
8 | import os
9 | import pypinyin
10 | import pypinyin.contrib.tone_convert
11 | import pypinyin.style
12 | import re
13 | import requests
14 | import subprocess
15 | import sys
16 | import threading
17 | import traceback
18 | import warnings
19 |
20 |
21 | print_lock = threading.Lock()
22 |
23 |
def stack_func_stdout(func):
    """Decorator: buffer everything *func* prints, then emit it atomically.

    The wrapped function must accept a ``file=`` keyword; the wrapper forces
    it to an in-memory StringIO so output from concurrently running calls is
    never interleaved.  On success the buffered text is replayed to the real
    stdout under ``print_lock``; on any exception the traceback plus the
    partial buffer are dumped and ``sys.exit(1)`` is raised.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        temp_stdout = io.StringIO()
        # Redirect the wrapped function's printing into the buffer.
        kwargs["file"] = temp_stdout
        try:
            ret = func(*args, **kwargs)
        except Exception as e:
            with print_lock:
                print("-" * 20, "Error occurred:", file=sys.stdout, flush=True)
                print(traceback.format_exc(), file=sys.stderr, flush=True)
                print("-" * 20, "Output:", file=sys.stdout, flush=True)
                print(temp_stdout.getvalue(), end="", file=sys.stdout, flush=True)
                # NOTE(review): when this runs inside a ThreadPoolExecutor
                # worker, SystemExit is captured by the Future instead of
                # terminating the process immediately — confirm this is intended.
                sys.exit(1)
        # Success: replay the buffered output in one locked burst.
        with print_lock:
            print(temp_stdout.getvalue(), end="", file=sys.stdout, flush=True)
        return ret

    return wrapper
43 |
44 |
def resub_concurrent(pattern, repl, string, count=0, flags=0, thread_count=16):
    """re.sub equivalent whose replacement callable runs concurrently.

    Each match of *pattern* in *string* is handed to *repl* (a callable
    receiving the ``re.Match``) on a thread pool, and the results are
    stitched back together in order.  ``count=0`` means replace all.

    Fixes over the original implementation:
      * the ThreadPoolExecutor is now shut down via a context manager
        instead of being leaked;
      * when *count* is reached, the unscanned tail of *string* is kept
        (matching ``re.sub`` semantics) instead of being dropped.

    Note: as before, the caller's patterns are expected to consume at least
    one character per match; a zero-width match at position 0 would loop.
    """
    assert callable(repl)
    parts = []  # interleaved literal strings and Futures, in document order
    replaced = 0
    with concurrent.futures.ThreadPoolExecutor(thread_count) as pool:
        while string and (count == 0 or replaced < count):
            m = re.search(pattern, string, flags)
            if m is None:
                break
            if m.start() > 0:
                # Keep the literal text before the match.
                parts.append(string[: m.start()])
            parts.append(pool.submit(repl, m))
            string = string[m.end() :]
            replaced += 1
        # Keep whatever was not scanned (empty string when fully consumed).
        parts.append(string)
    # The pool has been shut down, so every future is already resolved.
    return "".join(p if isinstance(p, str) else p.result() for p in parts)
72 |
73 |
# --- Module-level setup (runs at import time) -------------------------------

# check_url may retry with verify=False on SSL errors, so mute urllib3's
# InsecureRequestWarning.
warnings.filterwarnings("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning)
# check_url recurses once per redirect hop; a low recursion limit caps
# pathological redirect loops.
sys.setrecursionlimit(64)

# Whitelist json file, contains an array of regex strings; matching domains
# are reported alive without being contacted (see check_whitelist/check_url).
with open("whitelist.json", mode="rt", encoding="utf8") as f:
    whitelist = json.load(f)

# Shared DNS resolver with public fallbacks (114DNS and Google DNS) appended.
resolver = dns.resolver.Resolver()
resolver.nameservers += ["114.114.114.114", "8.8.8.8"]

# Browser-like User-Agent so servers don't reject the checker outright.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
}

# Get proxies: one URL per line; blank lines and '#' comments are ignored.
with open("proxies.txt", "rt", encoding="utf-8") as f:
    proxies = list(i for i in map(str.strip, f.read().splitlines()) if i and not i.startswith("#"))
# Leading None means "try a direct connection first" (see check_url's loop).
proxies = [None] + list(map(lambda x: {"http": x, "https": x}, proxies))

# The markdown list of universities whose links will be verified.
with open("README.md", "rt", encoding="utf-8") as f:
    md_content = f.read()

# Limit connection retries.
# NOTE(review): the original comment said "Disable retry", but max_retries=2
# configures up to two retries — confirm which behavior was intended.
s = requests.Session()
a = requests.adapters.HTTPAdapter(max_retries=2)
s.mount("http://", a)
s.mount("https://", a)
102 |
103 |
def print_error(*s, **kwargs):
    """Print *s* (or "Error" when empty) wrapped in ANSI red, then reset.

    ``end`` is applied after the color reset; a caller-supplied ``flush``
    is discarded because every write here flushes unconditionally.
    """
    end = kwargs.pop("end", "\n")
    kwargs.pop("flush", None)
    body = s if s else ("Error",)
    print("\033[31m", end="", flush=True, **kwargs)
    print(*body, end="", flush=True, **kwargs)
    print("\033[0m", end=end, flush=True, **kwargs)
119 |
120 |
def print_success(*s, **kwargs):
    """Print *s* (or "Success" when empty) wrapped in ANSI green, then reset.

    Mirrors print_error: ``end`` is applied after the color reset and any
    caller-supplied ``flush`` is dropped (writes always flush).
    """
    end = kwargs.pop("end", "\n")
    kwargs.pop("flush", None)
    message = s or ("Success",)
    print("\033[32m", end="", flush=True, **kwargs)
    print(*message, end="", flush=True, **kwargs)
    print("\033[0m", end=end, flush=True, **kwargs)
136 |
137 |
@stack_func_stdout
def replace_table_row(m: re.Match, file=sys.stdout):
    """Re-render one markdown table row after verifying its URL.

    *m* groups: (1) link display text, (2) URL, (3) school-name column.
    Returns the rebuilt row with a status badge:
    ``:white_check_mark:`` reachable, ``:warning:`` whitelisted EDU domain,
    ``:question:`` redirect target failed, ``:x:`` dead.
    The ``file`` default is replaced by the decorator's StringIO buffer.
    """
    print(file=file)
    url = m.group(2)
    # Normalize: ensure a scheme, and decode a punycode (IDNA) host.
    if not re.match(r"https?://", url):
        url = "http://" + url
    url_split = url.split("/")
    url_split[2] = idna.decode(url_split[2])
    url = "/".join(url_split)
    # Failed attempts bucketed by check_url's status code (0 dead, 2 bad redirect).
    successes = {0: [], 1: [], 2: []}
    try:
        success, method = check_url(url, file=file)
    except Exception:
        success = 0
        method = "Other Error"
        print("\n", end="", flush=True, file=file)
        print_error(traceback.format_exc(), flush=True, file=file)
    if success == 1:
        if method[0] == "Unknown":
            # Plain A-record hit: show no method label in the badge.
            method = ("", None)
        elif method[0] == "EDU Domain":
            # Whitelisted domain was never contacted; flag it as unverified.
            return f"| [{m.group(1)}]({url}) | {m.group(3)} | :warning: {method[0]} |"
        # Drop any *italic suggestion* previously appended to the name column.
        suggestion_removed = re.sub(r"\*.*?\*", "", m.group(3))
        return f"| [{m.group(1)}]({url}) | {suggestion_removed} | :white_check_mark: {method[0]} |"
    else:
        successes[success].append((url, method))
        print_error("Failed, trying other possible URLs", file=file)
        # Retry with scheme/www variants of the original URL.
        for new_url in get_other_possible_url(url):
            try:
                success, method1 = check_url(new_url, file=file)
            except Exception:
                success = 0
                method1 = "Other Error"
                print("\n", end="", flush=True, file=file)
                print_error(traceback.format_exc(), flush=True, file=file)
            if success == 1:
                if method1[0] == "Unknown":
                    method1 = ("", None)
                suggestion_removed = re.sub(r"\*.*?\*", "", m.group(3))
                # A variant works: rewrite the row to point at it.
                return f"| [{m.group(1)}]({new_url}) | {suggestion_removed} | :white_check_mark: {method1[0]} |"
            successes[success].append((new_url, method1))
        if successes[2]:
            # Some variant resolved but its redirect target failed; prefer the
            # first such variant whose error is not NXDOMAIN.
            if sum(1 for i in successes[2] if i[1] == "NXDOMAIN") != len(successes[2]):
                i = 0
                while successes[2][i][1] == "NXDOMAIN":
                    i += 1
                return f"| [{m.group(1)}]({successes[2][i][0]}) | {m.group(3)} | :question: {successes[2][i][1]} |"
    # Nothing worked: keep the original URL and mark the row dead.
    return f"| [{m.group(1)}]({url}) | {m.group(3)} | :x: {method} |"
186 |
187 |
def get_other_possible_url(url):
    """Return alternative URLs to try: toggle ``www.`` and flip the scheme.

    Results are sorted in reverse lexicographic order, which puts the
    https variants ahead of the http ones.
    """
    assert re.match(r"https?://", url)
    candidates = []
    if re.match(r"https?://www\.", url):
        # Host already has "www." — try both schemes without it.
        candidates.append(re.sub(r"https?://www\.", "http://", url))
        candidates.append(re.sub(r"https?://www\.", "https://", url))
    else:
        # Bare host — try both schemes with "www." prepended.
        candidates.append(re.sub(r"https?://", "http://www.", url))
        candidates.append(re.sub(r"https?://", "https://www.", url))
    # Also flip the scheme while keeping the host exactly as given.
    if url.startswith("https://"):
        candidates.append("http://" + url[len("https://") :])
    else:
        candidates.append("https://" + url[len("http://") :])
    return sorted(candidates, reverse=True)
207 |
208 |
def get_domain(url):
    """Return the host portion of *url*, prepending "http://" if no scheme."""
    prefixed = url if re.match(r"https?://", url) else "http://" + url
    # After "scheme://", the host is the third slash-separated field.
    return prefixed.split("/")[2]
213 |
214 |
def check_whitelist(url):
    """Return True when *url* matches any regex in the module-level whitelist."""
    return any(re.search(pattern, url) for pattern in whitelist)
220 |
221 |
def check_url(url, ignore_ssl=False, file=sys.stdout):
    """Check whether *url* is reachable, resolving DNS and following redirects.

    Returns a pair ``(status, info)``:
      * ``(1, (method, detail))`` — reachable; ``method`` is e.g. "CNAME",
        "Redirect 301", "Meta Refresh", "EDU Domain", or "Unknown"/"".
      * ``(0, error)``            — unreachable; ``error`` is a short string.
      * ``(2, suberror)``         — the page redirects but the redirect
        target failed; the caller treats this as "questionable".
    Redirects are followed by calling check_url recursively (depth bounded
    by the module-level recursion limit).  On SSLError the check is retried
    once with certificate verification disabled.
    """
    global proxies, resolver
    print(f"Checking [{url}]...", end=" ", flush=True, file=file)
    method = ("", None)
    # Whitelisted domains are assumed alive unless NO_SKIP_EDU is set.
    if check_whitelist(get_domain(url)) and os.environ.get("NO_SKIP_EDU") not in {"1", "true", "True", "yes"}:
        print_success("Found in whitelist, skipped", file=file)
        return 1, ("EDU Domain", None)
    error = "Unknown error"
    # First try a CNAME lookup — its target is recorded as the "method" detail.
    try:
        res = resolver.resolve(get_domain(url), "CNAME")
        # [:-1] strips the trailing dot of the absolute DNS name.
        print(f"CNAME to [{res[0].target.to_text()[:-1]}]", end=" ", flush=True, file=file)
        method = ("CNAME", res[0].target.to_text()[:-1])
    except dns.resolver.NoAnswer:
        pass
    except dns.resolver.NXDOMAIN:
        print("CNAME NXDOMAIN", end=" ", flush=True, file=file)
    except Exception:
        print("DNS CNAME error", end=" ", flush=True, file=file)
        error = "DNS CNAME error"
    # No CNAME: fall back to an A-record lookup.
    if not method[0]:
        try:
            res = resolver.resolve(get_domain(url), "A")
            print(f"A to [{res[0].address}]", end=" ", flush=True, file=file)
            method = ("Unknown", res[0].address)
        # NOTE(review): the messages in the next two handlers look swapped —
        # NoAnswer prints "A NXDOMAIN" while NXDOMAIN prints "DNS A error".
        # Both set error="NXDOMAIN", so only the log text is affected; confirm.
        except dns.resolver.NoAnswer:
            print("A NXDOMAIN", end=" ", flush=True, file=file)
            error = "NXDOMAIN"
        except dns.resolver.NXDOMAIN:
            print("DNS A error", end=" ", flush=True, file=file)
            error = "NXDOMAIN"
        except Exception:
            print("DNS A error", end=" ", flush=True, file=file)
            error = "DNS error"
    # DNS resolved: try HTTP, first directly (proxies[0] is None), then via
    # each configured proxy in turn.
    if method[0]:
        for idx, p in enumerate(proxies):
            try:
                if p is not None:
                    print(f" -- Using proxy {idx}...", end=" ", flush=True, file=file)
                r = s.get(url, allow_redirects=False, timeout=3, verify=not ignore_ssl, proxies=p, headers=header)
                # A non-2xx/3xx status fails immediately without trying
                # further proxies (the server itself answered).
                if not 200 <= r.status_code < 400:
                    print_error(f"Failed with status code {r.status_code}", file=file)
                    error = str(r.status_code)
                    return 0, error
                elif 300 <= r.status_code < 400:
                    target = r.headers["Location"]
                    # Resolve a relative Location against the current URL.
                    if not re.match(r"https?://", target):
                        if target.startswith("/"):
                            target = "/".join(url.split("/")[:3]) + target
                        elif url.split("#")[0].split("?")[0].endswith("/"):
                            target = url.split("#")[0].split("?")[0] + target
                        else:
                            target = "/".join(url.split("#")[0].split("?")[0].split("/")[:-1]) + "/" + target
                    print(f"Redirect to [{target}] with status code {r.status_code}", file=file)
                    print("-- Checking redirect...", end=" ", flush=True, file=file)
                    success, submethod = check_url(target, file=file)
                    # Same-site redirect after a CNAME: keep the CNAME detail
                    # but adopt the sub-check's method label.
                    if method[0] == "CNAME" and (get_domain(url) in target or get_domain(target) in url):
                        method = (submethod[0], method[1])
                    if success != 1:
                        # Redirect target failed — report "questionable" (2).
                        return 2, submethod
                    method = (f"Redirect {r.status_code}", target)
                elif 'http-equiv="refresh"' in r.text:
                    # HTML meta-refresh redirect: extract and resolve its URL.
                    target = re.search(r'content="\d+; *(?:url=)?(.*?)"', r.text, re.I).group(1)
                    if not re.match(r"https?://", target):
                        if target.startswith("/"):
                            target = "/".join(url.split("/")[:3]) + target
                        elif url.split("#")[0].split("?")[0].endswith("/"):
                            target = url.split("#")[0].split("?")[0] + target
                        else:
                            target = "/".join(url.split("#")[0].split("?")[0].split("/")[:-1]) + "/" + target
                    print(f"Redirect with meta refresh to [{target}]", file=file)
                    print("-- Checking redirect...", end=" ", flush=True, file=file)
                    success, submethod = check_url(target, file=file)
                    if method[0] == "CNAME":
                        method = (submethod[0], method[1])
                    if success != 1:
                        return 2, submethod
                    method = ("Meta Refresh", target)
                elif method[0] == "CNAME":
                    print_success("CNAME Success", file=file)
                else:
                    print_success("Unknown redirect method or no redirect", file=file)
                return 1, method
            except requests.exceptions.SSLError as e:
                if ignore_ssl:
                    # Already retried without verification — give up.
                    print_error(f"Failed with exception {e.__class__.__name__}", file=file)
                    return 0, e.__class__.__name__
                # Rewrite the progress line depending on whether this is a
                # recursive (redirect) check or a top-level one.
                if traceback.extract_stack()[-2].name == "check_url":
                    print("\r-- Checking redirect... ", end="", flush=True, file=file)
                else:
                    print("\r", end="", flush=True)
                print_error("SSLError, retrying without SSL...", end=" ", flush=True, file=file)
                return check_url(url, True, file=file)
            except requests.exceptions.RequestException as e:
                # Connection-level failure: fall through to the next proxy.
                print_error(f"Failed with exception {e.__class__.__name__}", file=file)
                error = "Connection error"
        else:
            # for-else: every proxy (including direct) failed to connect.
            print_error(file=file)
            return 0, error
    # DNS never resolved (method is empty): report the recorded DNS error.
    return 0, error
321 |
322 |
@pypinyin.style.register("tone_with_original")
def tone_with_original(pinyin, han, **kwargs):
    """Custom pypinyin style: pair the tone3-form pinyin with the original character."""
    tone3 = pypinyin.contrib.tone_convert.to_tone3(pinyin)
    return [tone3, han]
326 |
327 |
def handle_no_pinyin(s):
    """pypinyin ``errors`` callback for text it cannot romanize.

    If *s* contains no ASCII letters, each character becomes a bare
    single-item group; otherwise each character is paired with an empty
    pinyin so it sorts alongside romanized output.
    """
    has_ascii_letter = any("A" <= c <= "Z" or "a" <= c <= "z" for c in s)
    if has_ascii_letter:
        return [[["", c]] for c in s]
    return [[[c]] for c in s]
332 |
333 |
def reshape_pinyin(l):
    """Regroup per-character [pinyin, original] pairs into parallel runs.

    Consecutive two-element pairs are collected into two parallel lists
    (all pinyins, then all originals); a single-element pair or one with an
    empty pinyin flushes the current run and is appended as-is.
    """
    result = []
    pinyin_run = []
    hanzi_run = []
    for item in l:
        pair = item[0]
        if len(pair) != 1 and pair[0] != "":
            # Regular romanized character: extend the current run.
            pinyin_run.append(pair[0])
            hanzi_run.append(pair[1])
            continue
        # Non-romanized element: flush any accumulated run, then keep it verbatim.
        if pinyin_run:
            result.extend((pinyin_run, hanzi_run))
            pinyin_run = []
            hanzi_run = []
        result.append(pair)
    if pinyin_run:
        result.extend((pinyin_run, hanzi_run))
    return result
353 |
354 |
def sort_key(s):
    """Sort key for a markdown table row: (school-name pinyin, link pinyin).

    BUGFIX: the named groups had been stripped from the pattern (it read
    ``(?P.*?)``, which is invalid regex and breaks the later ``m["name"]`` /
    ``m["link"]`` lookups); ``(?P<link>...)`` and ``(?P<name>...)`` are restored.
    """
    m = re.match(r"\| *\[(.*?)\]\((?P<link>.*?)\) *\| *(?P<name>.*?) *\|.*", s)
    return [
        reshape_pinyin(pypinyin.pinyin(m["name"], style="tone_with_original", errors=handle_no_pinyin)),
        reshape_pinyin(pypinyin.pinyin(m["link"], style="tone_with_original", errors=handle_no_pinyin)),
    ]
361 |
362 |
# Detect newly merged submission PRs: every `new/*.add.md` file contributes
# table rows (verified via replace_table_row) to be appended to the main list;
# the file itself is queued for deletion, and its git authors are collected
# so they can be credited on the update commit.
new = []
remove_pending = []
co_authors = set()
for filename in os.listdir("new"):
    if not filename.endswith(".add.md"):
        continue
    path = os.path.join("new", filename)
    with open(path, "rt", encoding="utf-8") as f:
        add_content = f.read()
    # Verify and re-render every well-formed "| [name](url) | school |" row.
    while m := re.search(r"\| *\[(.*?)\]\((.*?)\) *\| *(.*?) *\|.*", add_content):
        new.append(replace_table_row(m))
        add_content = add_content[m.end() :]
    # Collect every author/co-author that touched this file, skipping bots.
    log = subprocess.check_output(["git", "log", path]).decode()
    for author in re.findall(r"(?:(?:Author|Co-authored-by|Signed-off-by): )(.*? <.*?>)", log):
        if "[bot]" in author:
            # BUGFIX: this was `break`, which aborted the scan at the first
            # bot author and dropped every remaining human contributor.
            continue
        co_authors.add(author)
    remove_pending.append(path)
381 |
382 |
383 | if not "COUNT_ONLY" in os.environ or os.environ["COUNT_ONLY"] not in ("1", "true", "True"):
384 | md_out = resub_concurrent(r"\| *\[(.*?)\]\((.*?)\) *\| *(.*?) *\|.*", replace_table_row, md_content)
385 | print("Sorting...")
386 | m = re.match(
387 | r"(?P.*?)(?P(\| *\[[^\n]*?\]\([^\n]*?\) *\|[^\n]*?\|[^\n]*?\|\n?)+)(?P