├── .gitignore ├── ExportMD.py ├── README.MD └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | yuque 2 | .userinfo 3 | -------------------------------------------------------------------------------- /ExportMD.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # ----------------------------------------- 3 | # createTime : 2021-08-17 4 | # author : Truda 5 | # email : truda8@pm.me 6 | # description: 自动导出语雀知识库为Markdown格式 7 | # ----------------------------------------- 8 | 9 | from prettytable import PrettyTable 10 | import re 11 | import os 12 | import time 13 | import aiohttp 14 | import asyncio 15 | from urllib import parse 16 | from PyInquirer import prompt, Separator 17 | from examples import custom_style_2 18 | from colr import color 19 | from cfonts import render, say 20 | 21 | 22 | class ExportMD: 23 | def __init__(self): 24 | self.repo_table = PrettyTable(["知识库ID", "名称"]) 25 | self.namespace, self.Token = self.get_UserInfo() 26 | self.headers = { 27 | "Content-Type": "application/json", 28 | "User-Agent": "ExportMD", 29 | "X-Auth-Token": self.Token 30 | } 31 | self.repo = {} 32 | self.export_dir = './yuque' 33 | 34 | def print_logo(self): 35 | output = render('ExportMD', colors=['red', 'yellow'], align='center') 36 | print(output) 37 | 38 | # 语雀用户信息 39 | def get_UserInfo(self): 40 | f_name = ".userinfo" 41 | if os.path.isfile(f_name): 42 | with open(f_name, encoding="utf-8") as f: 43 | userinfo = f.read().split("&") 44 | else: 45 | namespace = input("请输入语雀namespace:") 46 | Token = input("请输入语雀Token:") 47 | userinfo = [namespace, Token] 48 | with open(f_name, "w") as f: 49 | f.write(namespace + "&" + Token) 50 | return userinfo 51 | 52 | # 发送请求 53 | async def req(self, session, api): 54 | url = "https://www.yuque.com/api/v2" + api 55 | # print(url) 56 | async with session.get(url, headers=self.headers) as resp: 57 | result = 
await resp.json() 58 | return result 59 | 60 | # 获取所有知识库 61 | async def getRepo(self): 62 | api = "/users/%s/repos" % self.namespace 63 | async with aiohttp.ClientSession() as session: 64 | result = await self.req(session, api) 65 | for repo in result.get('data'): 66 | repo_id = str(repo['id']) 67 | repo_name = repo['name'] 68 | self.repo[repo_name] = repo_id 69 | self.repo_table.add_row([repo_id, repo_name]) 70 | 71 | # 获取一个知识库的文档列表 72 | async def get_docs(self, repo_id): 73 | api = "/repos/%s/docs" % repo_id 74 | async with aiohttp.ClientSession() as session: 75 | result = await self.req(session, api) 76 | docs = {} 77 | for doc in result.get('data'): 78 | title = doc['title'] 79 | slug = doc['slug'] 80 | docs[slug] = title 81 | return docs 82 | 83 | # 获取正文 Markdown 源代码 84 | async def get_body(self, repo_id, slug): 85 | api = "/repos/%s/docs/%s" % (repo_id, slug) 86 | async with aiohttp.ClientSession() as session: 87 | result = await self.req(session, api) 88 | body = result['data']['body'] 89 | body = re.sub("","", body) # 正则去除语雀导出的标签 90 | body = re.sub("\x00", "", body) # 去除不可见字符\x00 91 | body = re.sub("\x05", "", body) # 去除不可见字符\x05 92 | body = re.sub(r'\
!\[image.png\]',"\n![image.png]",body) # 正则去除语雀导出的图片后紧跟的
标签 93 | body = re.sub(r'\)\
', ")\n", body) # 正则去除语雀导出的图片后紧跟的
标签 94 | return body 95 | 96 | # 选择知识库 97 | def selectRepo(self): 98 | choices = [{"name": repo_name} for repo_name, _ in self.repo.items()] 99 | choices.insert(0, Separator('=== 知识库列表 ===')) 100 | questions = [ 101 | { 102 | 'type': 'checkbox', 103 | 'qmark': '>>>', 104 | 'message': '选择知识库', 105 | 'name': 'repo', 106 | 'choices': choices 107 | } 108 | ] 109 | repo_name_list = prompt(questions, style=custom_style_2) 110 | return repo_name_list["repo"] 111 | 112 | # 创建文件夹 113 | def mkDir(self, dir): 114 | isExists = os.path.exists(dir) 115 | if not isExists: 116 | os.makedirs(dir) 117 | 118 | # 获取文章并执行保存 119 | async def download_md(self, repo_id, slug, repo_name, title): 120 | """ 121 | :param repo_id: 知识库id 122 | :param slug: 文章id 123 | :param repo_name: 知识库名称 124 | :param title: 文章名称 125 | :return: none 126 | """ 127 | body = await self.get_body(repo_id, slug) 128 | new_body, image_list = await self.to_local_image_src(body) 129 | 130 | if image_list: 131 | # 图片保存位置: .yuque//assets/ 132 | save_dir = os.path.join(self.export_dir, repo_name, "assets") 133 | self.mkDir(save_dir) 134 | async with aiohttp.ClientSession() as session: 135 | await asyncio.gather( 136 | *(self.download_image(session, image_info, save_dir) for image_info in image_list) 137 | ) 138 | 139 | self.save(repo_name, title, new_body) 140 | 141 | print("📑 %s 导出成功!" % color(title, fore='green', style='bright')) 142 | 143 | # 将md里的图片地址替换成本地的图片地址 144 | async def to_local_image_src(self, body): 145 | body = re.sub(r'\
!\[image.png\]',"\n![image.png]",body) # 正则去除语雀导出的图片后紧跟的
标签 146 | body = re.sub(r'\)\
', ")\n", body) # 正则去除语雀导出的图片后紧跟的
标签 147 | 148 | pattern = r"!\[(?P.*?)\]" \ 149 | r"\((?Phttps:\/\/cdn\.nlark\.com\/yuque.*\/(?P\d+)\/(?P.*?\.[a-zA-z]+)).*\)" 150 | repl = r"![\g](./assets/\g)" 151 | images = [_.groupdict() for _ in re.finditer(pattern, body)] 152 | new_body = re.sub(pattern, repl, body) 153 | return new_body, images 154 | 155 | # 下载图片 156 | async def download_image(self, session, image_info: dict, save_dir: str): 157 | img_src = image_info['img_src'] 158 | filename = image_info["filename"] 159 | 160 | async with session.get(img_src) as resp: 161 | with open(os.path.join(save_dir, filename), 'wb') as f: 162 | f.write(await resp.read()) 163 | 164 | # 保存文章 165 | def save(self, repo_name, title, body): 166 | # 将不能作为文件名的字符进行编码 167 | def check_safe_path(path: str): 168 | for char in r'/\<>?:"|*': 169 | path = path.replace(char, parse.quote_plus(char)) 170 | return path 171 | 172 | repo_name = check_safe_path(repo_name) 173 | title = check_safe_path(title) 174 | save_path = "./yuque/%s/%s.md" % (repo_name, title) 175 | with open(save_path, "w", encoding="utf-8") as f: 176 | f.write(body) 177 | 178 | async def run(self): 179 | self.print_logo() 180 | await self.getRepo() 181 | repo_name_list = self.selectRepo() 182 | 183 | self.mkDir(self.export_dir) # 创建用于存储知识库文章的文件夹 184 | 185 | # 遍历所选知识库 186 | for repo_name in repo_name_list: 187 | dir_path = self.export_dir + "/" + repo_name.replace("/", "%2F") 188 | dir_path.replace("//", "/") 189 | self.mkDir(dir_path) 190 | 191 | repo_id = self.repo[repo_name] 192 | docs = await self.get_docs(repo_id) 193 | 194 | # 异步导出接口会报错,修改为同步导出,且每次导出等待50ms 195 | for slug in docs: 196 | time.sleep(0.05) 197 | title = docs[slug] 198 | await self.download_md(repo_id, slug, repo_name, title) 199 | 200 | # await asyncio.gather( 201 | # *(self.download_md(repo_id, slug, repo_name, title) for slug, title in docs.items()) 202 | # ) 203 | 204 | print("\n" + color('🎉 导出完成!', fore='green', style='bright')) 205 | print("已导出到:" + color(os.path.realpath(self.export_dir), 
fore='green', style='bright')) 206 | 207 | 208 | if __name__ == '__main__': 209 | export = ExportMD() 210 | loop = asyncio.get_event_loop() 211 | loop.run_until_complete(export.run()) 212 | -------------------------------------------------------------------------------- /README.MD: -------------------------------------------------------------------------------- 1 | # ExportMD-rectify-pics 2 | ## 1、写在前面 3 | ![Version](https://img.shields.io/badge/Version-1.0.0-blue) ![License](https://img.shields.io/badge/license-MIT-yellow) ![python](https://img.shields.io/badge/python->=3.6-orange) 4 | 5 | 简介:ExportMD语雀知识库自动导出为 Markdown 格式,支持同时导出多个知识库。 6 | 但ExportMD导出连续三张图片时,会出现错误,只有第一张图片的内容会被识别,本复刻版本增加了正则表达式,修复这一情况 7 | 8 | 本复刻版本无法提交issue,有问题欢迎到[语雀评论区](https://www.yuque.com/duzh929/blog/ocffqg)和我交流! 9 | 10 | 欢迎给我的语雀文档点赞,或者给我复刻的仓库star 11 | 12 | 另外我想说明一下,复刻这个仓库的目的是为了防止重要的笔记丢失,让大家能够有一个本地的markdown笔记备份,但我并不希望语雀因此流失用户,语雀真的是一款非常优秀的笔记软件,我会一直支持下去的! 13 | 14 | 15 | ### 版本v1.0 16 | API导出连着的图片时,会出现很多`
<br />`,这时连续图片中只有第一张图片会被识别 17 | ``` 18 | ![image.png](https://cdn.nlark.com/xxx1.png)<br />
![image.png](https://cdn.nlark.com/xxx2.png)<br />
![image.png](https://cdn.nlark.com/xxx3.png) 19 | 20 | ``` 21 | 为了解决这一问题,在原来的ExportMD基础上,我增加了正则表达式,现在连续图片的导出已经没有问题 22 | 这里特别感谢ExportMD! 23 | ### 版本v1.1 24 | 实际使用的时候有遇到一些小问题,主要是接口限制等会导致有时候请求失败 25 | 这里使用[@stone0090](https://github.com/stone0090/ExportMD-rectify-pics)的解决方案 26 | 27 | https://github.com/stone0090/ExportMD-rectify-pics/blob/bf2b79ef0afaffcad78253cafa1760f45359623e/ExportMD.py#L198 28 | 29 | 修改为同步导出,且每次导出等待50ms 30 | 本人技术水平有限,感谢[@stone0090](https://github.com/stone0090/ExportMD-rectify-pics)的修复! 31 | 32 | 33 | ## 2、安装 34 | ### 环境要求 35 | - python >= 3.6 && python <= 3.9 36 | 37 | ### 安装 python 依赖 38 | ```bash 39 | pip install -r requirements.txt 40 | ``` 41 | --- 42 | 43 | ## 3、使用 44 | 45 | ## 获取 namespace 46 | 知识库 namespace:知识库 URL 中路径部分 47 | 48 | 知识库 [https://www.yuque.com/YourYuqueUserName](https://www.yuque.com/YourYuqueUserName) 对应的 *namespace* 为 *YourYuqueUserName* 49 | 50 | ## 获取 token 51 | ![token](https://s3.jpg.cm/2021/08/17/IUIASp.png) 52 | ## 使用 python3 运行 53 | ```bash 54 | python ExportMD.py 55 | ``` 56 | ## 输入namespace和token 57 | `⬆ ⬇`移动,`space`选择,`a`全选,`Enter`确认 58 | ![image](https://user-images.githubusercontent.com/61380549/162611337-9b2f875f-6cf0-47d6-87ba-6aa6a7f5efef.png) 59 | 60 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | prettytable 2 | aiohttp 3 | PyInquirer 4 | Colr 5 | python-cfonts --------------------------------------------------------------------------------