├── start.bat
├── input.txt
├── assets
│   ├── yuque-demo.png
│   └── image-20240621112212019.png
├── README.md
└── main.py

/start.bat:
--------------------------------------------------------------------------------
python main.py --input input.txt --output D:\
--------------------------------------------------------------------------------
/input.txt:
--------------------------------------------------------------------------------
https://www.yuque.com/xxx/xxx,xxx
https://www.yuque.com/xxx/xxx,xxx
--------------------------------------------------------------------------------
/assets/yuque-demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Spritualkb/yuque-spider-plus/HEAD/assets/yuque-demo.png
--------------------------------------------------------------------------------
/assets/image-20240621112212019.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Spritualkb/yuque-spider-plus/HEAD/assets/image-20240621112212019.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# A newer version with a GUI is available
https://github.com/Spritualkb/yuque-spider-gui

# yuque-spider-plus

This project is a modified version of https://github.com/burpheart/yuque-crawl

A Yuque document scraper (crawler) that can save any user's entire Yuque knowledge base as Markdown, including the complete directory structure and an index.

![](./assets/yuque-demo.png)

Usage:
Install Python 3:

https://www.python.org/downloads/

Install the required modules (beautifulsoup4 provides the bs4 package that main.py imports):

```shell
pip install requests tqdm urllib3 beautifulsoup4
```

Run the crawler:

`python3 main.py <Yuque doc URL>`

Demo:
`python3 main.py https://www.yuque.com/burpheart/phpaudit`



## 2024/07/03

### Added cookie support for crawling password-protected knowledge bases

Copy the full cookie string from your browser and pass it on the command line.

Example 1: URL and cookie

```shell
python main.py "https://www.yuque.com/burpheart/phpaudit" --cookie "verified_books=****"
```


Example 2: URL, cookie, and output path

```shell
python main.py "https://www.yuque.com/burpheart/phpaudit" --cookie "verified_books=****" --output "download"
```


Example 3: URL only

```shell
python main.py "https://www.yuque.com/burpheart/phpaudit"
```

Example 4: URL and output path

```shell
python main.py "https://www.yuque.com/burpheart/phpaudit" --output "download"
```


Example 5: default arguments (displays the help message)

```shell
python main.py
```
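
Under the hood, the cookie value is forwarded verbatim as an HTTP Cookie header on every request the script makes. A minimal sketch of that behavior (the cookie value below is a placeholder):

```python
import requests

cookie = "verified_books=****"  # placeholder: paste the cookie string copied from your browser
headers = {'Cookie': cookie} if cookie else {}

# Protected pages are only served when a valid cookie accompanies the request.
response = requests.get("https://www.yuque.com/burpheart/phpaudit", headers=headers, timeout=10)
print(response.status_code)
```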

## 2024/07/04

### View remote images locally

Fixed cases where some images failed to load locally: remote images are now downloaded, and each image path in the Markdown is rewritten to a relative path under ./assets.
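
The fix is built around a re.sub callback that downloads each remote image and swaps in the local link. A condensed sketch of what save_page in main.py does:

```python
import os
import re
import time

import requests


def localize_images(markdown_content, assets_dir, headers=None):
    """Download remote images and rewrite their Markdown links to ./assets."""
    def download_image(match):
        url = match.group(1)
        if not url.startswith('http'):
            return match.group(0)  # already a local path; leave it untouched
        # Name the file by timestamp; take the extension from the URL path only.
        ext = os.path.splitext(url.split('#')[0].split('?')[0])[1]
        name = f"image-{int(time.time() * 1000)}{ext}"
        with open(os.path.join(assets_dir, name), 'wb') as img_file:
            img_file.write(requests.get(url, headers=headers or {}, timeout=10).content)
        return f'![{name}](./assets/{name})'

    # Every Markdown image reference is passed through the callback.
    return re.sub(r'!\[.*?\]\((.*?)\)', download_image, markdown_content)
```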

## 2024/09/27

### Batch crawling of note URLs

URLs and cookies are read from input.txt, one pair per line, with the URL and cookie separated by a comma.

### Run

```shell
python main.py --input input.txt --output D:\Notebook
```
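
Each line is split on the first comma only, so a cookie value may itself contain commas. A sketch of the per-line parsing that main.py applies (the URL and cookie are placeholders):

```python
line = "https://www.yuque.com/xxx/xxx,verified_books=****"  # placeholder line from input.txt

parts = line.strip().split(',', 1)             # split on the first comma only
url = parts[0]
cookie = parts[1] if len(parts) > 1 else None  # the cookie part is optional
print(url, cookie)
```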
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import sys
import requests
import json
import re
import os
import urllib.parse
import time
import random
import argparse
from requests.adapters import HTTPAdapter
from tqdm import tqdm
from urllib3 import Retry
from bs4 import BeautifulSoup


def fetch_url_title(url, cookies=None):
    """Fetch a page and derive a filesystem-safe title for the knowledge base."""
    try:
        headers = {'Cookie': cookies} if cookies else {}
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            title_tag = soup.title
            if title_tag and title_tag.string:
                title = title_tag.string.strip()
                # Replace characters that are illegal in file names, then strip the Yuque suffix.
                title_cleaned = re.sub(r'[\/\\:*?"<>|]', '-', title)
                title_cleaned = title_cleaned.replace(' · 语雀', '')
                # Personal repos use URLs like .../u12345/<slug>; prefix the slug for uniqueness.
                match = re.search(r'u\d+/([\w-]+)', url)
                if match:
                    final_title = f"{match.group(1)}-{title_cleaned}"
                    print("Page title:", final_title)
                    return final_title
                print("Page title:", title_cleaned)
                return title_cleaned
            return "untitled"
        print(f"Request failed, status code: {response.status_code}")
        return "request-failed"
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return "request-error"


def save_page(book_id, slug, path, cookies=None):
    """Download a single document as Markdown and localize its images."""
    try:
        headers = {'Cookie': cookies} if cookies else {}
        docsdata = requests.get(
            f'https://www.yuque.com/api/docs/{slug}?book_id={book_id}&merge_dynamic_data=false&mode=markdown',
            headers=headers, timeout=20
        )
        if docsdata.status_code != 200:
            print("Document download failed; the page may have been deleted:", book_id, slug, docsdata.content)
            return
        docsjson = json.loads(docsdata.content)
        markdown_content = docsjson['data']['sourcecode']

        assets_dir = os.path.join(os.path.dirname(path), 'assets')
        os.makedirs(assets_dir, exist_ok=True)

        def download_image(match):
            url = match.group(1)
            if not url.startswith('http'):
                return match.group(0)  # already a local path; leave it untouched
            url = url.split('#')[0]  # drop any fragment, keep the query string for the request
            timestamp = int(time.time() * 1000)
            # Take the extension from the URL path only, so query strings don't leak into it.
            extension = os.path.splitext(url.split('?')[0])[1]
            image_name = re.sub(r'[<>:"/\\|?*]', '_', f"image-{timestamp}{extension}")
            image_path = os.path.join(assets_dir, image_name)
            try:
                image_data = requests.get(url, headers=headers, timeout=10).content
                with open(image_path, 'wb') as img_file:
                    img_file.write(image_data)
                # Point the Markdown at the local copy.
                return f'![image-{timestamp}](./assets/{image_name})'
            except requests.exceptions.RequestException as e:
                print(f"Image download failed: {e}")
                return match.group(0)

        # Run every Markdown image reference through download_image.
        markdown_content = re.sub(r'!\[.*?\]\((.*?)\)', download_image, markdown_content)

        with open(path, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")


def get_book(url, cookies=None, output_path="download"):
    """Download an entire knowledge base, mirroring its table of contents on disk."""
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retries))
    headers = {'Cookie': cookies} if cookies else {}
    try:
        docsdata = session.get(url, headers=headers, timeout=10)
        # The table of contents is embedded in the page as a URL-encoded JSON blob.
        data = re.findall(r"decodeURIComponent\(\"(.+)\"\)\);", docsdata.content.decode('utf-8'))
        if not data:
            print("Book data not found on the page; check the URL and cookie.")
            return
        docsjson = json.loads(urllib.parse.unquote(data[0]))
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return

    toc_nodes = {}  # uuid -> {'title': ..., 'parent': parent uuid}
    paths = {}      # uuid -> relative directory path of the node inside the book
    md = ""
    # Map characters that are illegal in file names to underscores.
    table = str.maketrans('\\/:*?"<>|\n\r', "___________")

    book_title = fetch_url_title(url, cookies)
    output_dir = os.path.join(output_path, book_title)
    os.makedirs(output_dir, exist_ok=True)

    for doc in tqdm(docsjson['book']['toc'], desc="Downloading Documents", unit="doc"):
        if doc['type'] == 'TITLE' or doc['child_uuid'] != '':
            toc_nodes[doc['uuid']] = {'title': doc['title'], 'parent': doc['parent_uuid']}
            uuid = doc['uuid']
            paths[doc['uuid']] = ''
            # Walk up the parent chain to assemble this node's full directory path.
            while True:
                if toc_nodes[uuid]['parent'] != '':
                    if paths[doc['uuid']] == '':
                        paths[doc['uuid']] = doc['title'].translate(table)
                    else:
                        paths[doc['uuid']] = toc_nodes[uuid]['title'].translate(table) + '/' + paths[doc['uuid']]
                    uuid = toc_nodes[uuid]['parent']
                else:
                    paths[doc['uuid']] = toc_nodes[uuid]['title'].translate(table) + '/' + paths[doc['uuid']]
                    break
            doc_dir = os.path.join(output_dir, paths[doc['uuid']])
            os.makedirs(doc_dir, exist_ok=True)
            if paths[doc['uuid']].endswith("/"):
                md += "## " + paths[doc['uuid']][:-1] + "\n"
            else:
                md += " " * (paths[doc['uuid']].count("/") - 1) + "* " + \
                      paths[doc['uuid']][paths[doc['uuid']].rfind("/") + 1:] + "\n"
        if doc['url'] != '':
            if doc['parent_uuid'] != "":
                md += " " * paths[doc['parent_uuid']].count("/") + "* [" + doc['title'] + "](" + urllib.parse.quote(
                    paths[doc['parent_uuid']] + "/" + doc['title'].translate(table) + '.md') + ")" + "\n"
                save_page(str(docsjson['book']['id']), doc['url'],
                          os.path.join(output_dir, paths[doc['parent_uuid']], doc['title'].translate(table) + '.md'),
                          cookies)
            else:
                md += " " + "* [" + doc['title'] + "](" + urllib.parse.quote(
                    doc['title'].translate(table) + '.md') + ")" + "\n"
                save_page(str(docsjson['book']['id']), doc['url'],
                          os.path.join(output_dir, doc['title'].translate(table) + '.md'), cookies)
            # Random delay between documents to avoid hammering the API.
            time.sleep(random.randint(1, 4))

    # Write a GitBook-style index of everything that was downloaded.
    with open(os.path.join(output_dir, 'SUMMARY.md'), 'w', encoding='utf-8') as f:
        f.write(md)


def create_ascii_banner():
    banner = '''

                             __..._
                          ..-'     o.
                        .-'         :
                      _..'          .'__..--<
                 ...--""             '-.
             ..-"                       __.'
           .'                  ___...--'
          :           ____....---'
          :        .'
          :       :           _____
          :      :    _..--"""     """--..__
          :     :  ."                      ""i--.
          :     '.:                          :   '.
          :       '--...___i---""""--..___.'      :
          :                ""---...---""          :
           '.                                     :
             '-.                                  :
                '--...                           .'
                  :   ""---....._____.....---""
                  '.    '.
                    '-..  '.
                        '.  :
                         :  .'
                        /  :
                      .'   :
                    .'   .--'
                    '--'

    Hi, I'm Spritualkb. Welcome, and enjoy the program.
    '''
    return banner


if __name__ == '__main__':
    print(create_ascii_banner())
    parser = argparse.ArgumentParser(description='Download book documents from Yuque.')
    parser.add_argument('--input', default="input.txt",
                        help='Input file listing one "URL,cookie" pair per line.')
    parser.add_argument('--output', default="download", help='Output directory for downloaded files.')

    args = parser.parse_args()

    if not os.path.isfile(args.input):
        print(f"Input file {args.input} does not exist.")
        sys.exit(1)

    with open(args.input, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    for line in lines:
        line = line.strip()
        if not line:
            continue
        # Split on the first comma only, so cookie values may contain commas.
        parts = line.split(',', 1)
        url = parts[0]
        cookie = parts[1] if len(parts) > 1 else None
        print("Downloading:", url)
        print("Cookie:", cookie if cookie is not None else "None")
        get_book(url, cookie, args.output)
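

# The crawler can also be driven from another script instead of input.txt.
# A sketch (the URL and output path below are placeholders):
#
#     from main import get_book
#     get_book("https://www.yuque.com/burpheart/phpaudit", cookies=None, output_path="download")

--------------------------------------------------------------------------------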