├── start.bat
├── input.txt
├── assets
│   ├── yuque-demo.png
│   └── image-20240621112212019.png
├── README.md
└── main.py

/start.bat:
--------------------------------------------------------------------------------
python main.py --input input.txt --output D:\
--------------------------------------------------------------------------------
/input.txt:
--------------------------------------------------------------------------------
https://www.yuque.com/xxx/xxx,xxx
https://www.yuque.com/xxx/xxx,xxx
--------------------------------------------------------------------------------
/assets/yuque-demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Spritualkb/yuque-spider-plus/HEAD/assets/yuque-demo.png
--------------------------------------------------------------------------------
/assets/image-20240621112212019.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Spritualkb/yuque-spider-plus/HEAD/assets/image-20240621112212019.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# A newer version with a GUI is available
https://github.com/Spritualkb/yuque-spider-gui

# yuque-spider-plus

This project is a modified version of https://github.com/burpheart/yuque-crawl

A Yuque document scraper (crawler) that can save any user's entire Yuque knowledge base as Markdown, including the complete directory structure and an index.

![](./assets/yuque-demo.png)

Usage:
Install Python 3:

https://www.python.org/downloads/

Install the required modules (beautifulsoup4 provides the bs4 package that main.py imports):

```shell
pip install requests tqdm urllib3 beautifulsoup4
```

Run the crawler:

`python3 main.py <Yuque doc URL>`

Demo:
`python3 main.py https://www.yuque.com/burpheart/phpaudit`



## 2024/07/03

### Added cookie support for crawling password-protected knowledge bases

Copy the full cookie string from your browser and pass it on the command line.

Example 1: URL and cookie

```shell
python main.py "https://www.yuque.com/burpheart/phpaudit" --cookie "verified_books=****"
```


Example 2: URL, cookie, and output path

```shell
python main.py "https://www.yuque.com/burpheart/phpaudit" --cookie "verified_books=****" --output "download"
```


Example 3: URL only

```shell
python main.py "https://www.yuque.com/burpheart/phpaudit"
```

Example 4: URL and output path

```shell
python main.py "https://www.yuque.com/burpheart/phpaudit" --output "download"
```


Example 5: default arguments (displays the help message)

```shell
python main.py
```
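
Under the hood, the cookie value is forwarded verbatim as an HTTP Cookie header on every request the script makes. A minimal sketch of that behavior (the cookie value below is a placeholder):

```python
import requests

cookie = "verified_books=****"  # placeholder: paste the cookie string copied from your browser
headers = {'Cookie': cookie} if cookie else {}

# Protected pages are only served when a valid cookie accompanies the request.
response = requests.get("https://www.yuque.com/burpheart/phpaudit", headers=headers, timeout=10)
print(response.status_code)
```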

## 2024/07/04

### View remote images locally

Fixed cases where some images failed to load locally: remote images are now downloaded, and each image path in the Markdown is rewritten to a relative path under ./assets.
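
The fix is built around a re.sub callback that downloads each remote image and swaps in the local link. A condensed sketch of what save_page in main.py does:

```python
import os
import re
import time

import requests


def localize_images(markdown_content, assets_dir, headers=None):
    """Download remote images and rewrite their Markdown links to ./assets."""
    def download_image(match):
        url = match.group(1)
        if not url.startswith('http'):
            return match.group(0)  # already a local path; leave it untouched
        # Name the file by timestamp; take the extension from the URL path only.
        ext = os.path.splitext(url.split('#')[0].split('?')[0])[1]
        name = f"image-{int(time.time() * 1000)}{ext}"
        with open(os.path.join(assets_dir, name), 'wb') as img_file:
            img_file.write(requests.get(url, headers=headers or {}, timeout=10).content)
        return f'![{name}](./assets/{name})'

    # Every Markdown image reference is passed through the callback.
    return re.sub(r'!\[.*?\]\((.*?)\)', download_image, markdown_content)
```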

## 2024/09/27

### Batch crawling of note URLs

URLs and cookies are read from input.txt, one pair per line, with the URL and cookie separated by a comma.

### Run

```shell
python main.py --input input.txt --output D:\Notebook
```
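
Each line is split on the first comma only, so a cookie value may itself contain commas. A sketch of the per-line parsing that main.py applies (the URL and cookie are placeholders):

```python
line = "https://www.yuque.com/xxx/xxx,verified_books=****"  # placeholder line from input.txt

parts = line.strip().split(',', 1)             # split on the first comma only
url = parts[0]
cookie = parts[1] if len(parts) > 1 else None  # the cookie part is optional
print(url, cookie)
```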
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import sys
import requests
import json
import re
import os
import urllib.parse
import time
import random
import argparse
from requests.adapters import HTTPAdapter
from tqdm import tqdm
from urllib3 import Retry
from bs4 import BeautifulSoup


def fetch_url_title(url, cookies=None):
    """Fetch a page and derive a filesystem-safe title for the knowledge base."""
    try:
        headers = {'Cookie': cookies} if cookies else {}
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            title_tag = soup.title
            if title_tag and title_tag.string:
                title = title_tag.string.strip()
                # Replace characters that are illegal in file names, then strip the Yuque suffix.
                title_cleaned = re.sub(r'[\/\\:*?"<>|]', '-', title)
                title_cleaned = title_cleaned.replace(' · 语雀', '')
                # Personal repos use URLs like .../u12345/<slug>; prefix the slug for uniqueness.
                match = re.search(r'u\d+/([\w-]+)', url)
                if match:
                    final_title = f"{match.group(1)}-{title_cleaned}"
                    print("Page title:", final_title)
                    return final_title
                print("Page title:", title_cleaned)
                return title_cleaned
            return "untitled"
        print(f"Request failed, status code: {response.status_code}")
        return "request-failed"
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return "request-error"


def save_page(book_id, slug, path, cookies=None):
    """Download a single document as Markdown and localize its images."""
    try:
        headers = {'Cookie': cookies} if cookies else {}
        docsdata = requests.get(
            f'https://www.yuque.com/api/docs/{slug}?book_id={book_id}&merge_dynamic_data=false&mode=markdown',
            headers=headers, timeout=20
        )
        if docsdata.status_code != 200:
            print("Document download failed; the page may have been deleted:", book_id, slug, docsdata.content)
            return
        docsjson = json.loads(docsdata.content)
        markdown_content = docsjson['data']['sourcecode']

        assets_dir = os.path.join(os.path.dirname(path), 'assets')
        os.makedirs(assets_dir, exist_ok=True)

        def download_image(match):
            url = match.group(1)
            if not url.startswith('http'):
                return match.group(0)  # already a local path; leave it untouched
            url = url.split('#')[0]  # drop any fragment, keep the query string for the request
            timestamp = int(time.time() * 1000)
            # Take the extension from the URL path only, so query strings don't leak into it.
            extension = os.path.splitext(url.split('?')[0])[1]
            image_name = re.sub(r'[<>:"/\\|?*]', '_', f"image-{timestamp}{extension}")
            image_path = os.path.join(assets_dir, image_name)
            try:
                image_data = requests.get(url, headers=headers, timeout=10).content
                with open(image_path, 'wb') as img_file:
                    img_file.write(image_data)
                # Point the Markdown at the local copy.
                return f'![image-{timestamp}](./assets/{image_name})'
            except requests.exceptions.RequestException as e:
                print(f"Image download failed: {e}")
                return match.group(0)

        # Run every Markdown image reference through download_image.
        markdown_content = re.sub(r'!\[.*?\]\((.*?)\)', download_image, markdown_content)

        with open(path, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")


def get_book(url, cookies=None, output_path="download"):
    """Download an entire knowledge base, mirroring its table of contents on disk."""
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retries))
    headers = {'Cookie': cookies} if cookies else {}
    try:
        docsdata = session.get(url, headers=headers, timeout=10)
        # The table of contents is embedded in the page as a URL-encoded JSON blob.
        data = re.findall(r"decodeURIComponent\(\"(.+)\"\)\);", docsdata.content.decode('utf-8'))
        if not data:
            print("Book data not found on the page; check the URL and cookie.")
            return
        docsjson = json.loads(urllib.parse.unquote(data[0]))
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return

    toc_nodes = {}  # uuid -> {'title': ..., 'parent': parent uuid}
    paths = {}      # uuid -> relative directory path of the node inside the book
    md = ""
    # Map characters that are illegal in file names to underscores.
    table = str.maketrans('\\/:*?"<>|\n\r', "___________")

    book_title = fetch_url_title(url, cookies)
    output_dir = os.path.join(output_path, book_title)
    os.makedirs(output_dir, exist_ok=True)

    for doc in tqdm(docsjson['book']['toc'], desc="Downloading Documents", unit="doc"):
        if doc['type'] == 'TITLE' or doc['child_uuid'] != '':
            toc_nodes[doc['uuid']] = {'title': doc['title'], 'parent': doc['parent_uuid']}
            uuid = doc['uuid']
            paths[doc['uuid']] = ''
            # Walk up the parent chain to assemble this node's full directory path.
            while True:
                if toc_nodes[uuid]['parent'] != '':
                    if paths[doc['uuid']] == '':
                        paths[doc['uuid']] = doc['title'].translate(table)
                    else:
                        paths[doc['uuid']] = toc_nodes[uuid]['title'].translate(table) + '/' + paths[doc['uuid']]
                    uuid = toc_nodes[uuid]['parent']
                else:
                    paths[doc['uuid']] = toc_nodes[uuid]['title'].translate(table) + '/' + paths[doc['uuid']]
                    break
            doc_dir = os.path.join(output_dir, paths[doc['uuid']])
            os.makedirs(doc_dir, exist_ok=True)
            if paths[doc['uuid']].endswith("/"):
                md += "## " + paths[doc['uuid']][:-1] + "\n"
            else:
                md += " " * (paths[doc['uuid']].count("/") - 1) + "* " + \
                      paths[doc['uuid']][paths[doc['uuid']].rfind("/") + 1:] + "\n"
        if doc['url'] != '':
            if doc['parent_uuid'] != "":
                md += " " * paths[doc['parent_uuid']].count("/") + "* [" + doc['title'] + "](" + urllib.parse.quote(
                    paths[doc['parent_uuid']] + "/" + doc['title'].translate(table) + '.md') + ")" + "\n"
                save_page(str(docsjson['book']['id']), doc['url'],
                          os.path.join(output_dir, paths[doc['parent_uuid']], doc['title'].translate(table) + '.md'),
                          cookies)
            else:
                md += " " + "* [" + doc['title'] + "](" + urllib.parse.quote(
                    doc['title'].translate(table) + '.md') + ")" + "\n"
                save_page(str(docsjson['book']['id']), doc['url'],
                          os.path.join(output_dir, doc['title'].translate(table) + '.md'), cookies)
            # Random delay between documents to avoid hammering the API.
            time.sleep(random.randint(1, 4))

    # Write a GitBook-style index of everything that was downloaded.
    with open(os.path.join(output_dir, 'SUMMARY.md'), 'w', encoding='utf-8') as f:
        f.write(md)


def create_ascii_banner():
    banner = '''

                             __..._
                          ..-'     o.
                        .-'         :
                      _..'          .'__..--<
                 ...--""             '-.
             ..-"                       __.'
           .'                  ___...--'
          :           ____....---'
          :        .'
          :       :           _____
          :      :    _..--"""     """--..__
          :     :  ."                      ""i--.
          :     '.:                          :   '.
          :       '--...___i---""""--..___.'      :
          :                ""---...---""          :
           '.                                     :
             '-.                                  :
                '--...                           .'
                  :   ""---....._____.....---""
                  '.    '.
                    '-..  '.
                        '.  :
                         :  .'
                        /  :
                      .'   :
                    .'   .--'
                    '--'

    Hi, I'm Spritualkb. Welcome, and enjoy the program.
    '''
    return banner


if __name__ == '__main__':
    print(create_ascii_banner())
    parser = argparse.ArgumentParser(description='Download book documents from Yuque.')
    parser.add_argument('--input', default="input.txt",
                        help='Input file listing one "URL,cookie" pair per line.')
    parser.add_argument('--output', default="download", help='Output directory for downloaded files.')

    args = parser.parse_args()

    if not os.path.isfile(args.input):
        print(f"Input file {args.input} does not exist.")
        sys.exit(1)

    with open(args.input, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    for line in lines:
        line = line.strip()
        if not line:
            continue
        # Split on the first comma only, so cookie values may contain commas.
        parts = line.split(',', 1)
        url = parts[0]
        cookie = parts[1] if len(parts) > 1 else None
        print("Downloading:", url)
        print("Cookie:", cookie if cookie is not None else "None")
        get_book(url, cookie, args.output)
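

# The crawler can also be driven from another script instead of input.txt.
# A sketch (the URL and output path below are placeholders):
#
#     from main import get_book
#     get_book("https://www.yuque.com/burpheart/phpaudit", cookies=None, output_path="download")

--------------------------------------------------------------------------------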