├── .gitignore
├── ExportMD.py
├── README.MD
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
1 | yuque
2 | .userinfo
3 | 


--------------------------------------------------------------------------------
/ExportMD.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | # -----------------------------------------
  3 | # createTime : 2021-08-17
  4 | # author     : Truda
  5 | # email      : truda8@pm.me
  6 | # description: 自动导出语雀知识库为Markdown格式
  7 | # -----------------------------------------
  8 | 
  9 | from prettytable import PrettyTable
 10 | import re
 11 | import os
 12 | import time
 13 | import aiohttp
 14 | import asyncio
 15 | from urllib import parse
 16 | from PyInquirer import prompt, Separator
 17 | from examples import custom_style_2
 18 | from colr import color
 19 | from cfonts import render, say
 20 | 
 21 | 
 22 | class ExportMD:
 23 |     def __init__(self):
 24 |         self.repo_table = PrettyTable(["知识库ID", "名称"])
 25 |         self.namespace, self.Token = self.get_UserInfo()
 26 |         self.headers = {
 27 |             "Content-Type": "application/json",
 28 |             "User-Agent": "ExportMD",
 29 |             "X-Auth-Token": self.Token
 30 |         }
 31 |         self.repo = {}
 32 |         self.export_dir = './yuque'
 33 | 
 34 |     def print_logo(self):
 35 |         output = render('ExportMD', colors=['red', 'yellow'], align='center')
 36 |         print(output)
 37 | 
 38 |     # 语雀用户信息
 39 |     def get_UserInfo(self):
 40 |         f_name = ".userinfo"
 41 |         if os.path.isfile(f_name):
 42 |             with open(f_name, encoding="utf-8") as f:
 43 |                 userinfo = f.read().split("&")
 44 |         else:
 45 |             namespace = input("请输入语雀namespace：")
 46 |             Token = input("请输入语雀Token：")
 47 |             userinfo = [namespace, Token]
 48 |             with open(f_name, "w") as f:
 49 |                 f.write(namespace + "&" + Token)
 50 |         return userinfo
 51 | 
 52 |     # 发送请求
 53 |     async def req(self, session, api):
 54 |         url = "https://www.yuque.com/api/v2" + api
 55 |         # print(url)
 56 |         async with session.get(url, headers=self.headers) as resp:
 57 |             result = await resp.json()
 58 |             return result
 59 | 
 60 |     # 获取所有知识库
 61 |     async def getRepo(self):
 62 |         api = "/users/%s/repos" % self.namespace
 63 |         async with aiohttp.ClientSession() as session:
 64 |             result = await self.req(session, api)
 65 |             for repo in result.get('data'):
 66 |                 repo_id = str(repo['id'])
 67 |                 repo_name = repo['name']
 68 |                 self.repo[repo_name] = repo_id
 69 |                 self.repo_table.add_row([repo_id, repo_name])
 70 | 
 71 |     # 获取一个知识库的文档列表
 72 |     async def get_docs(self, repo_id):
 73 |         api = "/repos/%s/docs" % repo_id
 74 |         async with aiohttp.ClientSession() as session:
 75 |             result = await self.req(session, api)
 76 |             docs = {}
 77 |             for doc in result.get('data'):
 78 |                 title = doc['title']
 79 |                 slug = doc['slug']
 80 |                 docs[slug] = title
 81 |             return docs
 82 | 
 83 |     # 获取正文 Markdown 源代码
 84 |     async def get_body(self, repo_id, slug):
 85 |         api = "/repos/%s/docs/%s" % (repo_id, slug)
 86 |         async with aiohttp.ClientSession() as session:
 87 |             result = await self.req(session, api)
 88 |             body = result['data']['body']
 89 |             body = re.sub("<a name=\".*\"></a>","", body)  # 正则去除语雀导出的<a>标签
 90 |             body = re.sub("\x00", "", body) # 去除不可见字符\x00
 91 |             body = re.sub("\x05", "", body) # 去除不可见字符\x05
 92 |             body = re.sub(r'\<br \/\>!\[image.png\]',"\n![image.png]",body) # 正则去除语雀导出的图片后紧跟的<br \>标签
 93 |             body = re.sub(r'\)\<br \/\>', ")\n", body)  # 正则去除语雀导出的图片后紧跟的<br \>标签
 94 |             return body
 95 | 
 96 |     # 选择知识库
 97 |     def selectRepo(self):
 98 |         choices = [{"name": repo_name} for repo_name, _ in self.repo.items()]
 99 |         choices.insert(0, Separator('=== 知识库列表 ==='))
100 |         questions = [
101 |             {
102 |                 'type': 'checkbox',
103 |                 'qmark': '>>>',
104 |                 'message': '选择知识库',
105 |                 'name': 'repo',
106 |                 'choices': choices
107 |             }
108 |         ]
109 |         repo_name_list = prompt(questions, style=custom_style_2)
110 |         return repo_name_list["repo"]
111 | 
112 |     # 创建文件夹
113 |     def mkDir(self, dir):
114 |         isExists = os.path.exists(dir)
115 |         if not isExists:
116 |             os.makedirs(dir)
117 | 
118 |     # 获取文章并执行保存
119 |     async def download_md(self, repo_id, slug, repo_name, title):
120 |         """
121 |         :param repo_id: 知识库id
122 |         :param slug: 文章id
123 |         :param repo_name: 知识库名称
124 |         :param title: 文章名称
125 |         :return: none
126 |         """
127 |         body = await self.get_body(repo_id, slug)
128 |         new_body, image_list = await self.to_local_image_src(body)
129 | 
130 |         if image_list:
131 |             # 图片保存位置: .yuque/<repo_name>/assets/<filename>
132 |             save_dir = os.path.join(self.export_dir, repo_name, "assets")
133 |             self.mkDir(save_dir)
134 |             async with aiohttp.ClientSession() as session:
135 |                 await asyncio.gather(
136 |                     *(self.download_image(session, image_info, save_dir) for image_info in image_list)
137 |                 )
138 | 
139 |         self.save(repo_name, title, new_body)
140 | 
141 |         print("📑 %s 导出成功！" % color(title, fore='green', style='bright'))
142 | 
143 |     # 将md里的图片地址替换成本地的图片地址
144 |     async def to_local_image_src(self, body):
145 |         body = re.sub(r'\<br \/\>!\[image.png\]',"\n![image.png]",body) # 正则去除语雀导出的图片后紧跟的<br \>标签
146 |         body = re.sub(r'\)\<br \/\>', ")\n", body)  # 正则去除语雀导出的图片后紧跟的<br \>标签
147 |         
148 |         pattern = r"!\[(?P<img_name>.*?)\]" \
149 |                   r"\((?P<img_src>https:\/\/cdn\.nlark\.com\/yuque.*\/(?P<slug>\d+)\/(?P<filename>.*?\.[a-zA-z]+)).*\)"
150 |         repl = r"![\g<img_name>](./assets/\g<filename>)"
151 |         images = [_.groupdict() for _ in re.finditer(pattern, body)]
152 |         new_body = re.sub(pattern, repl, body)
153 |         return new_body, images
154 | 
155 |     # 下载图片
156 |     async def download_image(self, session, image_info: dict, save_dir: str):
157 |         img_src = image_info['img_src']
158 |         filename = image_info["filename"]
159 | 
160 |         async with session.get(img_src) as resp:
161 |             with open(os.path.join(save_dir, filename), 'wb') as f:
162 |                 f.write(await resp.read())
163 | 
164 |     # 保存文章
165 |     def save(self, repo_name, title, body):
166 |         # 将不能作为文件名的字符进行编码
167 |         def check_safe_path(path: str):
168 |             for char in r'/\<>?:"|*':
169 |                 path = path.replace(char, parse.quote_plus(char))
170 |             return path
171 | 
172 |         repo_name = check_safe_path(repo_name)
173 |         title = check_safe_path(title)
174 |         save_path = "./yuque/%s/%s.md" % (repo_name, title)
175 |         with open(save_path, "w", encoding="utf-8") as f:
176 |             f.write(body)
177 | 
178 |     async def run(self):
179 |         self.print_logo()
180 |         await self.getRepo()
181 |         repo_name_list = self.selectRepo()
182 |         
183 |         self.mkDir(self.export_dir)  # 创建用于存储知识库文章的文件夹
184 | 
185 |         # 遍历所选知识库
186 |         for repo_name in repo_name_list:
187 |             dir_path = self.export_dir + "/" + repo_name.replace("/", "%2F")
188 |             dir_path.replace("//", "/")
189 |             self.mkDir(dir_path)
190 | 
191 |             repo_id = self.repo[repo_name]
192 |             docs = await self.get_docs(repo_id)
193 |             
194 |             # 异步导出接口会报错，修改为同步导出，且每次导出等待50ms
195 |             for slug in docs:
196 |                 time.sleep(0.05)
197 |                 title = docs[slug]
198 |                 await self.download_md(repo_id, slug, repo_name, title)
199 | 
200 | #             await asyncio.gather(
201 | #                 *(self.download_md(repo_id, slug, repo_name, title) for slug, title in docs.items())
202 | #             )
203 | 
204 |         print("\n" + color('🎉 导出完成！', fore='green', style='bright'))
205 |         print("已导出到：" + color(os.path.realpath(self.export_dir), fore='green', style='bright'))
206 | 
207 | 
if __name__ == '__main__':
    export = ExportMD()
    # Create the event loop explicitly: asyncio.get_event_loop() is deprecated
    # when no loop is running, and the loop must be closed once the export ends.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(export.run())
    finally:
        loop.close()
212 | 


--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
 1 | # ExportMD-rectify-pics
 2 | ## 1、写在前面
 3 | ![Version](https://img.shields.io/badge/Version-1.0.0-blue) ![License](https://img.shields.io/badge/license-MIT-yellow) ![python](https://img.shields.io/badge/python->=3.6-orange)
 4 | 
 5 | 简介：ExportMD语雀知识库自动导出为 Markdown 格式，支持同时导出多个知识库。
 6 | 但ExportMD导出连续三张图片时，会出现错误，只有第一张图片的内容会被识别，本复刻版本增加了正则表达式，修复这一情况
 7 | 
 8 | 本复刻版本无法提交issue，有问题欢迎到[语雀评论区](https://www.yuque.com/duzh929/blog/ocffqg)和我交流！  
 9 | 
10 | 欢迎给我的语雀文档点赞，或者给我复刻的仓库star  
11 | 
12 | 另外我想说明一下，复刻这个仓库的目的是为了防止重要的笔记丢失，让大家能够有一个本地的markdown笔记备份，但我并不希望语雀因此流失用户，语雀真的是一款非常优秀的笔记软件，我会一直支持下去的！
13 | 
14 | 
15 | ### 版本v1.0
16 | API导出连着的图片时，会出现很多`<br />`，这时连续图片中只有第一张图片会被识别
17 | ```
18 | ![image.png](https://cdn.nlark.com/xxx1.png)<br />![image.png](https://cdn.nlark.com/xxx2.png)<br />![image.png](https://cdn.nlark.com/xxx3.png)
19 | 
20 | ```
21 | 为了解决这一问题，在原来的ExportMD基础上，我增加了正则表达式，现在连续图片的导出已经没有问题
22 | 这里特别感谢ExportMD！
23 | ### 版本v1.1
24 | 实际使用的时候有遇到一些小问题，主要是接口限制等会导致有时候请求失败
25 | 这里使用[@stone0090](https://github.com/stone0090/ExportMD-rectify-pics)的解决方案  
26 | 
27 | https://github.com/stone0090/ExportMD-rectify-pics/blob/bf2b79ef0afaffcad78253cafa1760f45359623e/ExportMD.py#L198 
28 | 
修改为同步导出，且每次导出等待50ms。
本人技术水平有限，感谢[@stone0090](https://github.com/stone0090/ExportMD-rectify-pics)的修复！
31 | 
32 | 
33 | ## 2、安装
34 | ### 环境要求
35 |  - python >= 3.6 && python <= 3.9
36 | 
37 | ### 安装 python 依赖
38 | ```bash
39 | pip install -r requirements.txt
40 | ```
41 | ---
42 | 
43 | ## 3、使用
44 | 
45 | ## 获取 namespace
46 | 知识库 namespace：知识库 URL 中路径部分
47 | 
48 | 知识库 [https://www.yuque.com/YourYuqueUserName](https://www.yuque.com/YourYuqueUserName)  对应的 *namespace* 为 *YourYuqueUserName*
49 | 
50 | ## 获取 token
51 | ![token](https://s3.jpg.cm/2021/08/17/IUIASp.png)
52 | ## 使用 python3 运行
53 | ```bash
54 | python ExportMD.py
55 | ```
56 | ## 输入namespace和token
57 | `⬆ ⬇`移动，`space`选择，`a`全选，`Enter`确认
58 | ![image](https://user-images.githubusercontent.com/61380549/162611337-9b2f875f-6cf0-47d6-87ba-6aa6a7f5efef.png)
59 | 
60 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | prettytable
2 | aiohttp
3 | PyInquirer
4 | Colr
5 | python-cfonts


--------------------------------------------------------------------------------