├── .dockerignore
├── .gitignore
├── All_Translation.py
├── Bing_translation.py
├── Deepl_Translation.py
├── Dockerfile
├── EbookTranslator
│   ├── EbookTranslator
│   │   ├── All_Translation.py
│   │   ├── Deepl_Translation.py
│   │   ├── LLMS_translation.py
│   │   ├── YouDao_translation.py
│   │   ├── __init__.py
│   │   ├── cli.py
│   │   ├── convert2pdf.py
│   │   ├── load_config.py
│   │   └── main_function.py
│   ├── LICENSE
│   ├── README.md
│   ├── requirements.txt
│   └── setup.py
├── LICENSE
├── LLMS_translation.py
├── OldMain.py
├── README.md
├── README_CN.md
├── README_JA.md
├── README_KO.md
├── README_TW.md
├── Subset_Font.py
├── YouDao_translation.py
├── app.py
├── build.py
├── config.json
├── convert2pdf.py
├── demo.mp4
├── demo.pdf
├── demo_zh.pdf
├── docker-compose.yml
├── download_model.py
├── get_new_blocks.py
├── icon.ico
├── index.html
├── languagedetect.py
├── load_config.py
├── main.py
├── merge_pdf.py
├── pdf_thumbnail.py
├── pdfviewer.html
├── pdfviewer2.html
├── recent.json
├── requirements.txt
├── static
│   ├── 1.js
│   ├── 2.js
│   ├── 3.js
│   ├── 4.js
│   ├── Figure_1.png
│   ├── Line-model-demo.pdf
│   ├── Line-model-demo_zh.pdf
│   ├── PolyglotPDF.png
│   ├── colorspace_issue_sample.pdf
│   ├── demo.gif
│   ├── demo.mp4
│   ├── i18n.js
│   ├── main.css
│   ├── merged_pdf
│   │   └── 2403.20127v1_auto_zh.pdf
│   ├── original
│   │   ├── 2403.20127v1.pdf
│   │   ├── 2501.05450v1.pdf
│   │   └── demo.pdf
│   ├── page1.png
│   ├── page2.jpeg
│   ├── page3.png
│   ├── page4.png
│   ├── setup.css
│   ├── setup.js
│   ├── target
│   │   ├── 2403.20127v1_zh.pdf
│   │   └── 2501.05450v1_zh.pdf
│   └── thumbnail
│       ├── ...txt
│       ├── 2403.20127v1.png
│       ├── 2501.05450v1.png
│       ├── 2g2.png
│       ├── 32g2.png
│       ├── High-precision real-time autonomous driving targetdetection based on YOLOv8.png
│       ├── g2.png
│       ├── g55.png
│       ├── g6.png
│       ├── gl1.png
│       ├── line.png
│       ├── m2.png
│       └── zz1.png
├── temp
│   └── fonts
│       └── zh_subset.ttf
└── update_recent.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | # Ignore Git-related files
2 | .git
3 | .gitignore
4 |
5 | # Ignore Python cache and compiled files
6 | __pycache__/
7 | **/__pycache__/
8 | *.pyc
9 | *.pyo
10 | *.pyd
11 |
12 | # Ignore virtual environment folders
13 | .Python
14 | env/
15 | venv/
16 |
17 | # Ignore pip logs
18 | pip-log.txt
19 | pip-delete-this-directory.txt
20 |
21 | # Ignore test and coverage files
22 | .tox/
23 | .coverage
24 | .coverage.*
25 | .cache/
26 | nosetests.xml
27 | coverage.xml
28 | *.cover
29 |
30 | # Ignore log files
31 | *.log
32 | logs/*.log
33 |
34 | # Ignore the pytest cache
35 | .pytest_cache/
36 |
37 | # Ignore the lib folder in the project root
38 | lib/
39 |
40 | # Ignore IDE configuration files
41 | .idea/
42 | .vscode/
43 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | venv
3 | *.local
4 |
--------------------------------------------------------------------------------
/All_Translation.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | import Deepl_Translation as dt
4 | import YouDao_translation as yt
5 | import Bing_translation as bt
6 | import LLMS_translation as lt
7 | import asyncio
8 | from functools import wraps
9 | import threading
10 | from queue import Queue
11 |
12 | # Semaphore limiting concurrency to 1 (serial processing)
13 | translation_semaphore = asyncio.Semaphore(1)
14 | # Lock that keeps queue operations thread-safe
15 | queue_lock = threading.Lock()
16 | # Queue of pending translation requests
17 | translation_queue = Queue()
18 | # Flag marking whether the queue processor has been started
19 | queue_processor_started = False
20 |
21 | def retry_on_error(max_retries=2, delay=1):
22 | def decorator(func):
23 | @wraps(func)
24 | def wrapper_sync(*args, **kwargs):
25 | retries = 0
26 | while retries <= max_retries:
27 | try:
28 | return func(*args, **kwargs)
29 | except Exception as e:
30 | retries += 1
31 | if retries <= max_retries:
32 | print(f"Error occurred: {str(e)}")
33 | print(f"Retrying... (Attempt {retries} of {max_retries})")
34 | time.sleep(delay)
35 | else:
36 | print(f"Max retries reached. Skipping... Final error: {str(e)}")
37 | return None
38 | return None
39 |
40 | async def wrapper_async(*args, **kwargs):
41 | retries = 0
42 | while retries <= max_retries:
43 | try:
44 | return await func(*args, **kwargs)
45 | except Exception as e:
46 | retries += 1
47 | if retries <= max_retries:
48 | print(f"Error occurred: {str(e)}")
49 | print(f"Retrying... (Attempt {retries} of {max_retries})")
50 | await asyncio.sleep(delay)
51 | else:
52 | print(f"Max retries reached. Skipping... Final error: {str(e)}")
53 | return None
54 | return None
55 |
56 | return wrapper_async if asyncio.iscoroutinefunction(func) else wrapper_sync
57 | return decorator
58 |
59 | # Queue processor function
60 | def process_translation_queue():
61 | global queue_processor_started
62 |
63 | # Create the event loop only once, here
64 | loop = asyncio.new_event_loop()
65 | asyncio.set_event_loop(loop)
66 |
67 | while True:
68 | task = translation_queue.get()
69 | if task is None: # termination signal
70 | translation_queue.task_done()
71 | break
72 | try:
73 | func, args, kwargs, result_holder = task
74 | # Run it directly on the loop created above
75 | result = loop.run_until_complete(func(*args, **kwargs))
76 | result_holder['result'] = result
77 | except Exception as e:
78 | print(f"Error processing translation task: {str(e)}")
79 | result_holder['result'] = None
80 | finally:
81 | translation_queue.task_done()
82 |
83 | # Only after leaving the loop is the event loop shut down, once
84 | # First clean up async generators
85 | loop.run_until_complete(loop.shutdown_asyncgens())
86 | # Then close the loop
87 | loop.close()
88 | # Start the queue-processing thread
89 | def ensure_queue_processor():
90 | global queue_processor_started
91 | with queue_lock:
92 | if not queue_processor_started:
93 | threading.Thread(target=process_translation_queue, daemon=True).start()
94 | queue_processor_started = True
95 |
96 | class Online_translation:
97 | def __init__(self, original_language, target_language, translation_type, texts_to_process=[]):
98 | self.model_name = f"opus-mt-{original_language}-{target_language}"
99 | self.original_text = texts_to_process
100 | self.target_language = target_language
101 | self.original_lang = original_language
102 | self.translation_type = translation_type
103 | # Make sure the queue processor has been started
104 | ensure_queue_processor()
105 |
106 | def run_async(self, coro):
107 | # Container for the result
108 | result_holder = {'result': None}
109 |
110 | # Wrap the coroutine as a task and put it on the queue
111 | translation_queue.put((self._run_coro_with_semaphore, [coro], {}, result_holder))
112 |
113 | # Wait for the task to finish
114 | translation_queue.join()
115 |
116 | # Return the result
117 | return result_holder['result']
118 |
119 | async def _run_coro_with_semaphore(self, coro):
120 | # Use the semaphore to guarantee serial execution
121 | async with translation_semaphore:
122 | return await coro
123 |
124 | def translation(self):
125 | print('Translation API:', self.translation_type)
126 | if self.translation_type == 'deepl':
127 | translated_list = self.deepl_translation()
128 | elif self.translation_type == 'youdao':
129 | translated_list = self.youdao_translation()
130 | elif self.translation_type == 'bing':
131 | translated_list = self.bing_translation()
132 | elif self.translation_type == 'openai':
133 | translated_list = self.run_async(self.openai_translation())
134 | elif self.translation_type == 'deepseek':
135 | translated_list = self.run_async(self.deepseek_translation())
136 | elif self.translation_type == 'Doubao':
137 | translated_list = self.run_async(self.Doubao_translation())
138 | elif self.translation_type == 'Qwen':
139 | translated_list = self.run_async(self.Qwen_translation())
140 | elif self.translation_type == 'Grok':
141 | translated_list = self.run_async(self.Grok_translation())
142 | elif self.translation_type == 'ThirdParty':
143 | translated_list = self.run_async(self.ThirdParty_translation())
144 | elif self.translation_type == 'GLM':
145 | translated_list = self.run_async(self.GLM_translation())
146 | else:
147 | translated_list = self.deepl_translation()
148 |
149 | return translated_list
150 |
151 | @retry_on_error()
152 | def deepl_translation(self):
153 | translated_texts = dt.translate(
154 | texts=self.original_text,
155 | original_lang=self.original_lang,
156 | target_lang=self.target_language
157 | )
158 | return translated_texts
159 |
160 | @retry_on_error()
161 | def youdao_translation(self):
162 | translated_texts = yt.translate(
163 | texts=self.original_text,
164 | original_lang=self.original_lang,
165 | target_lang=self.target_language
166 | )
167 | return translated_texts
168 |
169 | @retry_on_error()
170 | def bing_translation(self):
171 | try:
172 | translated_texts = bt.translate(
173 | texts=self.original_text,
174 | original_lang=self.original_lang,
175 | target_lang=self.target_language
176 | )
177 | print(f"Bing translation completed: {len(translated_texts)} texts processed")
178 | return translated_texts
179 | except Exception as e:
180 | print(f"Error in Bing translation: {e}")
181 | return [""] * len(self.original_text)
182 |
183 | @retry_on_error()
184 | async def openai_translation(self):
185 | translator = lt.Openai_translation()
186 | translated_texts = await translator.translate(
187 | texts=self.original_text,
188 | original_lang=self.original_lang,
189 | target_lang=self.target_language
190 | )
191 | return translated_texts
192 |
193 | @retry_on_error()
194 | async def deepseek_translation(self):
195 | translator = lt.Deepseek_translation()
196 | translated_texts = await translator.translate(
197 | texts=self.original_text,
198 | original_lang=self.original_lang,
199 | target_lang=self.target_language
200 | )
201 | return translated_texts
202 |
203 | @retry_on_error()
204 | async def Doubao_translation(self):
205 | translator = lt.Doubao_translation()
206 | translated_texts = await translator.translate(
207 | texts=self.original_text,
208 | original_lang=self.original_lang,
209 | target_lang=self.target_language
210 | )
211 | return translated_texts
212 |
213 | @retry_on_error()
214 | async def Qwen_translation(self):
215 | translator = lt.Qwen_translation()
216 | translated_texts = await translator.translate(
217 | texts=self.original_text,
218 | original_lang=self.original_lang,
219 | target_lang=self.target_language
220 | )
221 | return translated_texts
222 |
223 | @retry_on_error()
224 | async def Grok_translation(self):
225 | translator = lt.Grok_translation()
226 | try:
227 | translated_texts = await translator.translate(
228 | texts=self.original_text,
229 | original_lang=self.original_lang,
230 | target_lang=self.target_language
231 | )
232 | print(f"Grok translation completed: {len(translated_texts)} texts processed")
233 | return translated_texts
234 | except Exception as e:
235 | print(f"Error in Grok translation: {e}")
236 | return [""] * len(self.original_text)
237 |
238 | @retry_on_error()
239 | async def ThirdParty_translation(self):
240 | translator = lt.ThirdParty_translation()
241 | try:
242 | translated_texts = await translator.translate(
243 | texts=self.original_text,
244 | original_lang=self.original_lang,
245 | target_lang=self.target_language
246 | )
247 | print(f"ThirdParty translation completed: {len(translated_texts)} texts processed")
248 | return translated_texts
249 | except Exception as e:
250 | print(f"Error in ThirdParty translation: {e}")
251 | return [""] * len(self.original_text)
252 |
253 | @retry_on_error()
254 | async def GLM_translation(self):
255 | translator = lt.GLM_translation()
256 | try:
257 | translated_texts = await translator.translate(
258 | texts=self.original_text,
259 | original_lang=self.original_lang,
260 | target_lang=self.target_language
261 | )
262 | print(f"GLM translation completed: {len(translated_texts)} texts processed")
263 | return translated_texts
264 | except Exception as e:
265 | print(f"Error in GLM translation: {e}")
266 | return [""] * len(self.original_text)
267 |
268 | # Make sure resources are cleaned up before the program exits
269 | import atexit
270 |
271 | @atexit.register
272 | def cleanup():
273 | # Send the termination signal
274 | if queue_processor_started:
275 | translation_queue.put(None)
276 | # Give the queue processor some time to handle the termination signal
277 | translation_queue.join()
278 |
279 | t = time.time()
280 |
281 | def split_text_to_fit_token_limit(text, encoder, index_text, max_length=280):
282 | tokens = encoder.encode(text)
283 | if len(tokens) <= max_length:
284 | return [(text, len(tokens), index_text)]
285 |
286 | split_points = [i for i, token in enumerate(tokens) if encoder.decode([token]).strip() in [' ', '.', '?', '!','!','?','。']]
287 | parts = []
288 | last_split = 0
289 | for i, point in enumerate(split_points + [len(tokens)]):
290 | if point - last_split > max_length:
291 | part_tokens = tokens[last_split:split_points[i - 1]]
292 | parts.append((encoder.decode(part_tokens), len(part_tokens), index_text))
293 | last_split = split_points[i - 1]
294 | elif i == len(split_points):
295 | part_tokens = tokens[last_split:]
296 | parts.append((encoder.decode(part_tokens), len(part_tokens), index_text))
297 |
298 | return parts
299 |
300 | def process_texts(texts, encoder):
301 | processed_texts = []
302 | for i, text in enumerate(texts):
303 | sub_texts = split_text_to_fit_token_limit(text, encoder, i)
304 | processed_texts.extend(sub_texts)
305 | return processed_texts
306 |
307 | def calculate_split_points(processed_texts, max_tokens=425):
308 | split_points = []
309 | current_tokens = 0
310 |
311 | for i in range(len(processed_texts) - 1):
312 | current_tokens = processed_texts[i][1]
313 | next_tokens = processed_texts[i + 1][1]
314 |
315 | if current_tokens + next_tokens > max_tokens:
316 | split_points.append(i)
317 |
318 | split_points.append(len(processed_texts) - 1)
319 |
320 | return split_points
321 | #
322 | # def translate(texts,original_language,target_language):
323 | # from transformers import pipeline, AutoTokenizer
324 | #
325 | # model_name = f"./opus-mt-{original_language}-{target_language}"
326 | # pipe = pipeline("translation", model=model_name)
327 | # tokenizer = AutoTokenizer.from_pretrained(model_name)
328 | #
329 | # result = pipe(texts)
330 | #
331 | # result_values = [d['translation_text'] for d in result]
332 | #
333 | # return result_values
334 | #
335 | # def batch_translate(processed_texts, split_points,original_language,target_language):
336 | # translated_texts = []
337 | # index_mapping = {}
338 | #
339 | # start_index = 0
340 | #
341 | # for split_point in split_points:
342 | # batch = processed_texts[start_index:split_point + 1]
343 | # batch_texts = [text for text, _, _ in batch]
344 | # translated_batch = translate(texts=batch_texts,original_language=original_language,target_language=target_language)
345 | #
346 | # for translated_text, (_, _, int_value) in zip(translated_batch, batch):
347 | # if int_value in index_mapping:
348 | # translated_texts[index_mapping[int_value]] += " " + translated_text
349 | # else:
350 | # index_mapping[int_value] = len(translated_texts)
351 | # translated_texts.append(translated_text)
352 | #
353 | # start_index = split_point + 1
354 | #
355 | # return translated_texts
356 | #
357 |
--------------------------------------------------------------------------------
/Bing_translation.py:
--------------------------------------------------------------------------------
1 | import re
2 | import requests
3 | import time
4 | import threading
5 | import asyncio
6 | import aiohttp
7 | from concurrent.futures import ThreadPoolExecutor
8 |
9 | def translate(texts, original_lang, target_lang):
10 | """
11 | 使用Bing翻译API翻译文本列表 - 高性能实现
12 |
13 | Args:
14 | texts: 要翻译的文本列表
15 | original_lang: 源语言代码
16 | target_lang: 目标语言代码
17 |
18 | Returns:
19 | 翻译后的文本列表
20 | """
21 | # 确保输入文本为列表格式
22 | if isinstance(texts, str):
23 | texts = [texts]
24 |
25 | # 如果文本量小,使用简单的并发线程池
26 | if len(texts) <= 20:
27 | return translate_with_threadpool(texts, original_lang, target_lang)
28 |
29 | # 对于大量文本,使用异步IO处理
30 | return translate_with_asyncio(texts, original_lang, target_lang)
31 |
32 |
33 | def translate_with_threadpool(texts, original_lang, target_lang, max_workers=5):
34 | """使用线程池并发翻译小批量文本"""
35 | translator = BingTranslator(lang_in=original_lang, lang_out=target_lang)
36 | translated_texts = [""] * len(texts)
37 |
38 | def translate_one(index, text):
39 | try:
40 | translated_texts[index] = translator.do_translate(text)
41 | except Exception as e:
42 | print(f"翻译文本时出错 (索引 {index}): {e}")
43 | translated_texts[index] = ""
44 |
45 | # 使用线程池并发处理
46 | with ThreadPoolExecutor(max_workers=max_workers) as executor:
47 | futures = [executor.submit(translate_one, i, text)
48 | for i, text in enumerate(texts)]
49 |
50 | # 等待所有任务完成
51 | for future in futures:
52 | future.result()
53 |
54 | return translated_texts
55 |
56 |
57 | def translate_with_asyncio(texts, original_lang, target_lang):
58 | """使用asyncio异步处理大批量文本"""
59 | # 定义异步主函数
60 | async def main():
61 | translator = AsyncBingTranslator(lang_in=original_lang, lang_out=target_lang)
62 | return await translator.translate_batch(texts)
63 |
64 | # If the current thread has no event loop, create a new one
65 | try:
66 | loop = asyncio.get_event_loop()
67 | except RuntimeError:
68 | loop = asyncio.new_event_loop()
69 | asyncio.set_event_loop(loop)
70 |
71 | # Run the async function and return the result
72 | return loop.run_until_complete(main())
73 |
74 |
75 | def split_text_intelligently(text, max_length=1000):
76 | """智能分段文本,尽量在句子边界处断开"""
77 | if len(text) <= max_length:
78 | return [text]
79 |
80 | parts = []
81 | start = 0
82 |
83 | while start < len(text):
84 | # If the remaining text fits within max_length, append it directly
85 | if len(text) - start <= max_length:
86 | parts.append(text[start:])
87 | break
88 |
89 | # Compute the end position of the current chunk
90 | end = start + max_length
91 |
92 | # Try to break at a sentence boundary (priority: paragraph > period > comma > space)
93 | paragraph_break = text.rfind('\n', start, end)
94 | if paragraph_break != -1 and paragraph_break > start + max_length * 0.5:
95 | end = paragraph_break + 1
96 | else:
97 | # Look for a period, question mark, exclamation mark, etc.
98 | for sep in ['. ', '。', '?', '!', '? ', '! ']:
99 | pos = text.rfind(sep, start, end)
100 | if pos != -1 and pos > start + max_length * 0.5:
101 | end = pos + len(sep)
102 | break
103 | else:
104 | # If no sentence end was found, try breaking at a comma
105 | for sep in [', ', ',', '; ', ';']:
106 | pos = text.rfind(sep, start, end)
107 | if pos != -1 and pos > start + max_length * 0.7:
108 | end = pos + len(sep)
109 | break
110 | else:
111 | # As a last resort, break at a space
112 | pos = text.rfind(' ', start + max_length * 0.8, end)
113 | if pos != -1:
114 | end = pos + 1
115 |
116 | parts.append(text[start:end])
117 | start = end
118 |
119 | return parts
120 |
121 |
122 | class BingTranslator:
123 | name = "bing"
124 | lang_map = {"zh": "zh-Hans"}
125 |
126 | # Session parameter cache
127 | _cache_lock = threading.Lock()
128 | _sid_cache = None
129 | _sid_timestamp = 0
130 | _sid_cache_ttl = 300 # cache is valid for 5 minutes
131 |
132 | def __init__(self, lang_in, lang_out, model=None, ignore_cache=False):
133 | # Map language codes
134 | self.lang_in = self.lang_map.get(lang_in, lang_in)
135 | self.lang_out = self.lang_map.get(lang_out, lang_out)
136 |
137 | # Handle automatic language detection
138 | if self.lang_in == "auto":
139 | self.lang_in = "auto-detect"
140 |
141 | self.model = model
142 | self.ignore_cache = ignore_cache
143 | self.session = requests.Session()
144 | self.endpoint = "https://www.bing.com/translator"
145 | self.headers = {
146 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0",
147 | }
148 |
149 | def find_sid(self):
150 | """获取必要的会话参数,使用缓存减少请求"""
151 | current_time = time.time()
152 |
153 | # 检查缓存是否有效
154 | with self._cache_lock:
155 | if (not self.ignore_cache and
156 | BingTranslator._sid_cache is not None and
157 | (current_time - BingTranslator._sid_timestamp) < BingTranslator._sid_cache_ttl):
158 | return BingTranslator._sid_cache
159 |
160 | # 缓存无效,重新获取参数
161 | response = self.session.get(self.endpoint, headers=self.headers)
162 | response.raise_for_status()
163 | url = response.url[:-10]
164 | ig = re.findall(r"\"ig\":\"(.*?)\"", response.text)[0]
165 | iid = re.findall(r"data-iid=\"(.*?)\"", response.text)[-1]
166 | key, token = re.findall(
167 | r"params_AbusePreventionHelper\s=\s\[(.*?),\"(.*?)\",", response.text
168 | )[0]
169 |
170 | # Update the cache
171 | result = (url, ig, iid, key, token)
172 | with self._cache_lock:
173 | BingTranslator._sid_cache = result
174 | BingTranslator._sid_timestamp = current_time
175 |
176 | return result
177 |
178 | def do_translate(self, text):
179 | """执行翻译"""
180 | if not text or not text.strip():
181 | return ""
182 |
183 | # 如果文本超过1000字符,分段翻译
184 | if len(text) > 1000:
185 | parts = split_text_intelligently(text)
186 | translated_parts = []
187 |
188 | for part in parts:
189 | url, ig, iid, key, token = self.find_sid()
190 | response = self.session.post(
191 | f"{url}ttranslatev3?IG={ig}&IID={iid}",
192 | data={
193 | "fromLang": self.lang_in,
194 | "to": self.lang_out,
195 | "text": part[:1000], # 确保不超过1000
196 | "token": token,
197 | "key": key,
198 | },
199 | headers=self.headers,
200 | )
201 | response.raise_for_status()
202 | translated_parts.append(response.json()[0]["translations"][0]["text"])
203 |
204 | return ''.join(translated_parts)
205 |
206 | url, ig, iid, key, token = self.find_sid()
207 | response = self.session.post(
208 | f"{url}ttranslatev3?IG={ig}&IID={iid}",
209 | data={
210 | "fromLang": self.lang_in,
211 | "to": self.lang_out,
212 | "text": text,
213 | "token": token,
214 | "key": key,
215 | },
216 | headers=self.headers,
217 | )
218 | response.raise_for_status()
219 | return response.json()[0]["translations"][0]["text"]
220 |
221 |
222 | class AsyncBingTranslator:
223 | """异步Bing翻译器实现"""
224 | lang_map = {"zh": "zh-Hans"}
225 |
226 | # 会话参数缓存
227 | _sid_cache = None
228 | _sid_timestamp = 0
229 | _sid_cache_ttl = 300 # 5分钟缓存有效期
230 |
231 | def __init__(self, lang_in, lang_out):
232 | self.lang_in = self.lang_map.get(lang_in, lang_in)
233 | self.lang_out = self.lang_map.get(lang_out, lang_out)
234 |
235 | if self.lang_in == "auto":
236 | self.lang_in = "auto-detect"
237 |
238 | self.headers = {
239 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0",
240 | }
241 | self.endpoint = "https://www.bing.com/translator"
242 |
243 | async def find_sid(self, session):
244 | """异步获取会话参数,带缓存"""
245 | current_time = time.time()
246 |
247 | # 检查缓存是否有效
248 | if (AsyncBingTranslator._sid_cache is not None and
249 | (current_time - AsyncBingTranslator._sid_timestamp) < AsyncBingTranslator._sid_cache_ttl):
250 | return AsyncBingTranslator._sid_cache
251 |
252 | # 缓存无效,异步获取新参数
253 | async with session.get(self.endpoint, headers=self.headers) as response:
254 | if response.status != 200:
255 | raise Exception(f"获取会话参数失败: HTTP {response.status}")
256 |
257 | text = await response.text()
258 | url = str(response.url)[:-10]
259 | ig = re.findall(r"\"ig\":\"(.*?)\"", text)[0]
260 | iid = re.findall(r"data-iid=\"(.*?)\"", text)[-1]
261 | key, token = re.findall(
262 | r"params_AbusePreventionHelper\s=\s\[(.*?),\"(.*?)\",", text
263 | )[0]
264 |
265 | # Update the cache
266 | result = (url, ig, iid, key, token)
267 | AsyncBingTranslator._sid_cache = result
268 | AsyncBingTranslator._sid_timestamp = current_time
269 |
270 | return result
271 |
272 | async def translate_text(self, session, text):
273 | """翻译单个文本"""
274 | if not text or not text.strip():
275 | return ""
276 |
277 | # 如果文本超过1000字符,分段翻译
278 | if len(text) > 1000:
279 | parts = split_text_intelligently(text)
280 | translated_parts = []
281 |
282 | # 非递归异步处理每个文本块
283 | for part in parts:
284 | url, ig, iid, key, token = await self.find_sid(session)
285 |
286 | async with session.post(
287 | f"{url}ttranslatev3?IG={ig}&IID={iid}",
288 | data={
289 | "fromLang": self.lang_in,
290 | "to": self.lang_out,
291 | "text": part[:1000], # 确保不超过1000
292 | "token": token,
293 | "key": key,
294 | },
295 | headers=self.headers,
296 | ) as response:
297 | if response.status == 200:
298 | result = await response.json()
299 | translated_parts.append(result[0]["translations"][0]["text"])
300 | else:
301 | print(f"翻译请求失败: HTTP {response.status}")
302 | translated_parts.append("")
303 |
304 | return ''.join(translated_parts)
305 |
306 | try:
307 | url, ig, iid, key, token = await self.find_sid(session)
308 | response = await session.post(
309 | f"{url}ttranslatev3?IG={ig}&IID={iid}",
310 | data={
311 | "fromLang": self.lang_in,
312 | "to": self.lang_out,
313 | "text": text,
314 | "token": token,
315 | "key": key,
316 | },
317 | headers=self.headers,
318 | )
319 | if response.status == 200:
320 | result = await response.json()
321 | return result[0]["translations"][0]["text"]
322 | else:
323 | print(f"翻译请求失败: HTTP {response.status}")
324 | return ""
325 | except Exception as e:
326 | print(f"翻译过程中发生错误: {e}")
327 | print(f"原文: {text}")
328 | return ""
329 |
330 | async def translate_batch(self, texts, batch_size=10, max_concurrent=5):
331 | """批量翻译文本,控制并发数量和请求批次"""
332 | async with aiohttp.ClientSession() as session:
333 | results = [""] * len(texts)
334 | semaphore = asyncio.Semaphore(max_concurrent)
335 |
336 | async def translate_with_limit(index, text):
337 | retry_count = 0
338 | max_retries = 10
339 | backoff_time = 1.0 # initial retry wait time
340 |
341 | while retry_count < max_retries:
342 | try:
343 | async with semaphore:
344 | # Small delay between batches
345 | if index > 0 and index % batch_size == 0:
346 | await asyncio.sleep(0.1)
347 |
348 |
349 | translated = await self.translate_text(session, text)
350 | if translated: # translation succeeded
351 | results[index] = translated
352 | if retry_count > 0: # succeeded on a retry
353 | print(f"Text {index} succeeded after retrying!")
354 | return
355 | except Exception as e:
356 | print(f"Translation of text {index} failed (attempt {retry_count+1}/{max_retries}): {e}")
357 | print(f"Source text: {text}")
358 |
359 | # Reaching this point means a retry is needed
360 | retry_count += 1
361 | if retry_count < max_retries:
362 | print(f"Retrying in {backoff_time} seconds...")
363 | await asyncio.sleep(backoff_time)
364 | backoff_time *= 2 # exponential backoff
365 | else:
366 | print("Maximum retries reached; translation failed")
367 | results[index] = ""
368 |
369 | # Create all tasks
370 | tasks = [
371 | asyncio.create_task(translate_with_limit(i, text))
372 | for i, text in enumerate(texts)
373 | ]
374 |
375 | # Wait for all tasks to finish
376 | await asyncio.gather(*tasks)
377 | return results
378 |
379 |
380 | # Test code
381 | if __name__ == "__main__":
382 | test_texts = ["Hello, world!", "How are you today?", "Python is amazing", "I love programming"]
383 | results = translate(test_texts, "en", "zh")
384 |
385 | for original, translated in zip(test_texts, results):
386 | print(f"Original: {original}")
387 | print(f"Translated: {translated}")
388 | print("-" * 30)
--------------------------------------------------------------------------------
/Deepl_Translation.py:
--------------------------------------------------------------------------------
1 | import deepl
2 | import load_config
3 | def translate(texts,original_lang,target_lang):
4 |
5 | # Your DeepL auth key
6 |
7 |
8 | # Fetch the credentials for this service
9 |
10 |
11 | config = load_config.load_config()
12 |
13 | auth_key = config['translation_services']['deepl']['auth_key']
14 | # print(auth_key)
15 |
16 | translator = deepl.Translator(auth_key)
17 |
18 | # The list of texts to translate
19 |
20 |
21 | # Translate the text list into the target language
22 | print(original_lang,target_lang)
23 | if original_lang == 'auto':
24 | results = translator.translate_text(texts, target_lang=target_lang)
25 | else:
26 | results = translator.translate_text(texts, source_lang=original_lang, target_lang=target_lang)
27 |
28 |
29 | # Initialize an empty list to collect the translations
30 | translated_texts = []
31 |
32 | # Iterate over the results and append them to the list
33 | for result in results:
34 | translated_texts.append(result.text)
35 | return translated_texts
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 |
2 | # 1. Use the official Python 3.9 slim image as the base
3 | FROM python:3.9-slim
4 |
5 | # 2. If you need extra system libraries, install them here
6 | # e.g. gcc, libssl-dev, etc. (examples only)
7 | # RUN apt-get update && apt-get install -y --no-install-recommends \
8 | # gcc \
9 | # libssl-dev \
10 | # && rm -rf /var/lib/apt/lists/*
11 |
12 | # 3. Set the working directory
13 | WORKDIR /app
14 |
15 | # 4. Copy requirements.txt into the container
16 | COPY requirements.txt /app/
17 |
18 | # 5. Install the Python dependencies
19 | RUN pip install --no-cache-dir -r requirements.txt
20 |
21 | # 6. Copy the project source code into the container
22 | COPY . /app
23 |
24 | # 7. Expose port 12226 (if your project needs this port)
25 | EXPOSE 12226
26 |
27 | # 8. Run the Python script by default when the container starts
28 | CMD ["python", "app.py"]
29 |
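30 | # Example usage (a sketch only; the image tag "polyglotpdf" below is an arbitrary choice):
31 | #   docker build -t polyglotpdf .
32 | #   docker run -p 12226:12226 polyglotpdf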
--------------------------------------------------------------------------------
/EbookTranslator/EbookTranslator/All_Translation.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | from .import Deepl_Translation as dt
4 | from .import YouDao_translation as yt
5 | from .import LLMS_translation as lt
6 | import asyncio
7 |
8 | loop = asyncio.new_event_loop()
9 | asyncio.set_event_loop(loop)
10 | # #
11 | # Get the encoder of a specific model, assume gpt3.5, tiktoken is extremely fast,
12 | # and the error of this statistical token method is small and can be ignored
13 |
14 |
15 | class Online_translation:
16 | def __init__(self, original_language, target_language, translation_type, texts_to_process=[]):
17 | self.model_name = f"opus-mt-{original_language}-{target_language}"
18 | self.original_text = texts_to_process
19 | self.target_language = target_language
20 | self.original_lang = original_language
21 | self.translation_type = translation_type
22 |
23 | def run_async(self, coro):
24 | # Usually run_until_complete() is enough; no need to close() manually
25 | return loop.run_until_complete(coro)
26 |
27 | def translation(self):
28 | print('translation api',self.translation_type)
29 | if self.translation_type == 'deepl':
30 | translated_list = self.deepl_translation()
31 | elif self.translation_type == 'youdao':
32 | translated_list = self.youdao_translation()
33 | elif self.translation_type == 'bing':
34 | # Run the async function through the sync wrapper
35 | translated_list = self.run_async(self.bing_translation())
36 | elif self.translation_type == 'openai':
37 | # Run the async function through the sync wrapper
38 | translated_list = self.run_async(self.openai_translation())
39 | elif self.translation_type == 'deepseek':
40 | # Run the async function through the sync wrapper
41 | translated_list = self.run_async(self.deepseek_translation())
42 | elif self.translation_type == 'Doubao':
43 | # Run the async function through the sync wrapper
44 | translated_list = self.run_async(self.Doubao_translation())
45 | elif self.translation_type == 'Qwen':
46 | # Run the async function through the sync wrapper
47 | translated_list = self.run_async(self.Qwen_translation())
48 | elif self.translation_type == 'Grok':
49 | # Run the async function through the sync wrapper
50 | translated_list = self.run_async(self.Grok_translation())
51 | elif self.translation_type == 'ThirdParty':
52 | # Run the async function through the sync wrapper
53 | translated_list = self.run_async(self.ThirdParty_translation())
54 | elif self.translation_type == 'GLM':
55 | # Run the async function through the sync wrapper
56 | translated_list = self.run_async(self.GLM_translation())
57 | else:
58 | translated_list = self.deepl_translation()
59 |
60 | return translated_list
61 |
62 | def deepl_translation(self):
63 |
64 | translated_texts = dt.translate(texts=self.original_text,original_lang=self.original_lang,target_lang=self.target_language)
65 |
66 | return translated_texts
67 |
68 |
69 | def youdao_translation(self):
70 |
71 | translated_texts = yt.translate(texts=self.original_text,original_lang=self.original_lang,target_lang=self.target_language)
72 |
73 | return translated_texts
74 |
75 |
76 |
77 | async def openai_translation(self):
78 | translator = lt.Openai_translation()
79 | translated_texts = await translator.translate(
80 | texts=self.original_text,
81 | original_lang=self.original_lang,
82 | target_lang=self.target_language
83 | )
84 | return translated_texts
85 |
86 | async def deepseek_translation(self):
87 | translator = lt.Deepseek_translation()
88 | translated_texts = await translator.translate(
89 | texts=self.original_text,
90 | original_lang=self.original_lang,
91 | target_lang=self.target_language
92 | )
93 | return translated_texts
94 | async def Doubao_translation(self):
95 | translator = lt.Doubao_translation()
96 | translated_texts = await translator.translate(
97 | texts=self.original_text,
98 | original_lang=self.original_lang,
99 | target_lang=self.target_language
100 | )
101 | return translated_texts
102 | async def Qwen_translation(self):
103 | translator = lt.Qwen_translation()
104 | translated_texts = await translator.translate(
105 | texts=self.original_text,
106 | original_lang=self.original_lang,
107 | target_lang=self.target_language
108 | )
109 | return translated_texts
110 | async def Grok_translation(self):
111 | translator = lt.Grok_translation()
112 | try:
113 | translated_texts = await translator.translate(
114 | texts=self.original_text,
115 | original_lang=self.original_lang,
116 | target_lang=self.target_language
117 | )
118 | print(f"Grok translation completed: {len(translated_texts)} texts processed")
119 | return translated_texts
120 | except Exception as e:
121 | print(f"Error in Grok translation: {e}")
122 | return [""] * len(self.original_text)
123 |
124 | async def ThirdParty_translation(self):
125 | translator = lt.ThirdParty_translation()
126 | try:
127 | translated_texts = await translator.translate(
128 | texts=self.original_text,
129 | original_lang=self.original_lang,
130 | target_lang=self.target_language
131 | )
132 | print(f"ThirdParty translation completed: {len(translated_texts)} texts processed")
133 | return translated_texts
134 | except Exception as e:
135 | print(f"Error in ThirdParty translation: {e}")
136 | return [""] * len(self.original_text)
137 |
138 | async def GLM_translation(self):
139 | translator = lt.GLM_translation()
140 | try:
141 | translated_texts = await translator.translate(
142 | texts=self.original_text,
143 | original_lang=self.original_lang,
144 | target_lang=self.target_language
145 | )
146 | print(f"GLM translation completed: {len(translated_texts)} texts processed")
147 | return translated_texts
148 | except Exception as e:
149 | print(f"Error in GLM translation: {e}")
150 | return [""] * len(self.original_text)
151 |
152 | async def bing_translation(self):
153 | translator = lt.Bing_translation()
154 | try:
155 | translated_texts = await translator.translate(
156 | texts=self.original_text,
157 | original_lang=self.original_lang,
158 | target_lang=self.target_language
159 | )
160 | print(f"Bing translation completed: {len(translated_texts)} texts processed")
161 | return translated_texts
162 | except Exception as e:
163 | print(f"Error in Bing translation: {e}")
164 | return [""] * len(self.original_text)
165 |
166 |
167 | t = time.time()
168 | def split_text_to_fit_token_limit(text, encoder, index_text, max_length=280):
169 | tokens = encoder.encode(text)
170 | if len(tokens) <= max_length:
171 | return [(text, len(tokens), index_text)] # Return text along with its token count and original index
172 |
173 | # Pre-calculate possible split points (spaces, periods, etc.)
174 | split_points = [i for i, token in enumerate(tokens) if encoder.decode([token]).strip() in [' ', '.', '?', '!','!','?','。']]
175 | parts = []
176 | last_split = 0
177 | for i, point in enumerate(split_points + [len(tokens)]): # Ensure the last segment is included
178 | if point - last_split > max_length:
179 | part_tokens = tokens[last_split:split_points[i - 1]]
180 | parts.append((encoder.decode(part_tokens), len(part_tokens), index_text))
181 | last_split = split_points[i - 1]
182 | elif i == len(split_points): # Handle the last part
183 | part_tokens = tokens[last_split:]
184 | parts.append((encoder.decode(part_tokens), len(part_tokens), index_text))
185 |
186 | return parts
187 |
188 | def process_texts(texts, encoder):
189 | processed_texts = []
190 | for i, text in enumerate(texts):
191 | sub_texts = split_text_to_fit_token_limit(text, encoder, i)
192 | processed_texts.extend(sub_texts)
193 | return processed_texts
194 |
195 |
196 |
197 | def calculate_split_points(processed_texts, max_tokens=425):
198 | split_points = [] # indices of the split points
199 | current_tokens = 0 # running token count
200 |
201 | for i in range(len(processed_texts) - 1): # iterate up to the second-to-last element
202 | current_tokens = processed_texts[i][1]
203 | next_tokens = processed_texts[i + 1][1]
204 |
205 | # If the token counts of the current and next elements together exceed the limit
206 | if current_tokens + next_tokens > max_tokens:
207 | split_points.append(i) # the current element becomes a split point
208 | # Note: no need to reset current_tokens here; each iteration looks at a fresh pair
209 |
210 | # The last element is always a split point, since nothing follows it
211 | split_points.append(len(processed_texts) - 1)
212 |
213 | return split_points
214 |
215 |
216 | def translate(texts,original_language,target_language):
217 | # Offline translation using a local opus-mt model (returns the translated texts)
218 | from transformers import pipeline, AutoTokenizer
219 |
220 | model_name = f"./opus-mt-{original_language}-{target_language}" # replace with the actual model path
221 | # Build the translation pipeline from the local model path
222 | pipe = pipeline("translation", model=model_name)
223 | # Load the tokenizer from the local model path
224 | tokenizer = AutoTokenizer.from_pretrained(model_name)
225 |
226 | result = pipe(texts)
227 |
228 |
229 | # Extract the translated strings into a new list
230 | result_values = [d['translation_text'] for d in result]
231 |
232 | return result_values
233 |
234 |
235 |
236 | def batch_translate(processed_texts, split_points,original_language,target_language):
237 | translated_texts = [] # list of translated texts
238 | index_mapping = {} # maps each int_value to its index in translated_texts
239 |
240 | start_index = 0 # starting index of the current batch
241 |
242 | # Walk the split points and translate the texts batch by batch
243 | for split_point in split_points:
244 | # Extract the current batch (excluding the element after the split point)
245 | batch = processed_texts[start_index:split_point + 1]
246 | batch_texts = [text for text, _, _ in batch]
247 | # Translation call
248 | translated_batch = translate(texts=batch_texts,original_language=original_language,target_language=target_language)
249 |
250 | # Iterate over the translation results of this batch
251 | for translated_text, (_, _, int_value) in zip(translated_batch, batch):
252 | if int_value in index_mapping:
253 | # If the key already exists, append the new translation to the existing value
254 | translated_texts[index_mapping[int_value]] += " " + translated_text
255 | else:
256 | # Otherwise add it to the list and record its index
257 | index_mapping[int_value] = len(translated_texts)
258 | translated_texts.append(translated_text)
259 |
260 | # Update the starting index for the next batch
261 | start_index = split_point + 1
262 |
263 | return translated_texts
264 |
265 |
--------------------------------------------------------------------------------
/EbookTranslator/EbookTranslator/Deepl_Translation.py:
--------------------------------------------------------------------------------
1 | import deepl
2 | from .import load_config
3 | def translate(texts,original_lang,target_lang):
4 |
5 | # Your DeepL auth key
6 |
7 |
8 | # Fetch the credentials for this service
9 |
10 |
11 | config = load_config.load_config()
12 |
13 | auth_key = config['translation_services']['deepl']['auth_key']
14 | # print(auth_key)
15 |
16 | translator = deepl.Translator(auth_key)
17 |
18 | # The list of texts to translate
19 |
20 |
21 | # Translate the text list into the target language
22 | print(original_lang,target_lang)
23 | if original_lang == 'auto':
24 | results = translator.translate_text(texts, target_lang=target_lang)
25 | else:
26 | results = translator.translate_text(texts, source_lang=original_lang, target_lang=target_lang)
27 |
28 |
29 | # Initialize an empty list to collect the translations
30 | translated_texts = []
31 |
32 | # Iterate over the results and append them to the list
33 | for result in results:
34 | translated_texts.append(result.text)
35 | return translated_texts
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/EbookTranslator/EbookTranslator/YouDao_translation.py:
--------------------------------------------------------------------------------
1 | import uuid
2 | import requests
3 | import hashlib
4 | import time
5 | import json
6 |
7 |
8 | def translate(texts,original_lang, target_lang):
9 | """
10 | 有道翻译API接口
11 |
12 | 参数:
13 | texts: list, 要翻译的文本列表
14 | target_lang: str, 目标语言代码
15 | credentials: dict, 包含 app_key 和 app_secret 的字典
16 |
17 | 返回:
18 | list: 翻译后的文本列表
19 | """
20 | YOUDAO_URL = 'https://openapi.youdao.com/v2/api'
21 |
22 | with open("config.json", 'r', encoding='utf-8') as f:
23 | config = json.load(f)
24 |
25 | # Fetch the credentials for the selected service
26 | if target_lang == 'zh':
27 | target_lang='zh-CHS'
28 | service_name = "youdao"
29 | credentials = config['translation_services'].get(service_name)
30 | if not credentials:
31 | raise ValueError(f"Translation service '{service_name}' not found in config")
32 |
33 |
34 | def encrypt(sign_str):
35 | hash_algorithm = hashlib.sha256()
36 | hash_algorithm.update(sign_str.encode('utf-8'))
37 | return hash_algorithm.hexdigest()
38 |
39 | def truncate(q):
40 | if q is None:
41 | return None
42 | size = len(q)
43 | return q if size <= 20 else q[0:10] + str(size) + q[size - 10:size]
44 |
45 | def do_request(data):
46 | headers = {'Content-Type': 'application/x-www-form-urlencoded'}
47 | return requests.post(YOUDAO_URL, data=data, headers=headers)
48 |
49 | try:
50 | # Make sure the input is a list
51 | if isinstance(texts, str):
52 | texts = [texts]
53 |
54 | print(type(texts))
55 |
56 | # Prepare the request payload
57 | data = {
58 | 'from': original_lang,
59 | 'to': target_lang,
60 | 'signType': 'v3',
61 | 'curtime': str(int(time.time())),
62 | 'appKey': credentials['app_key'],
63 | 'q': texts,
64 | 'salt': str(uuid.uuid1()),
65 | 'vocabId': "您的用户词表ID"
66 | }
67 |
68 | # Generate the signature
69 | sign_str = (credentials['app_key'] +
70 | truncate(''.join(texts)) +
71 | data['salt'] +
72 | data['curtime'] +
73 | credentials['app_secret'])
74 | data['sign'] = encrypt(sign_str)
75 |
76 | # Send the request
77 | response = do_request(data)
78 | response_data = json.loads(response.content.decode("utf-8"))
79 |
80 | # Extract the translation results
81 | translations = [result["translation"] for result in response_data["translateResults"]]
82 | print(translations)
83 | return translations
84 |
85 | except Exception as e:
86 | print(f"翻译出错: {str(e)}")
87 | return None
88 | # 使用示例:
89 | if __name__ == '__main__':
90 | # Credentials
91 |
92 |
93 | # Texts to translate
94 | texts = ["hello", "sample text 2", "sample text 3"]
95 | original_lang = 'auto'
96 |
97 | # Target language
98 | target_lang = 'zh'
99 |
100 | # Run the translation
101 | results = translate(texts,original_lang='auto', target_lang=target_lang)
102 | print(results)
103 |
104 | if results:
105 | for original, translated in zip(texts, results):
106 | print(f"原文: {original}")
107 | print(f"译文: {translated}\n")
108 |
109 |
--------------------------------------------------------------------------------
/EbookTranslator/EbookTranslator/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | EbookTranslator - a layout-preserving e-book translation library
3 | The world's highest-performing layout-preserving e-book translation library
4 | """
5 |
6 | __version__ = '0.1.0'
7 |
8 | from .main_function import main_function
9 |
10 | # Export the main classes and functions
11 | __all__ = ['main_function']
12 |
--------------------------------------------------------------------------------
/EbookTranslator/EbookTranslator/cli.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Command-line interface for EbookTranslator
4 | """
5 |
6 | import argparse
7 | import sys
8 | import os
9 | from pathlib import Path
10 | from .main_function import main_function
11 |
12 |
13 | def main():
14 | """命令行入口点"""
15 | parser = argparse.ArgumentParser(description='翻译PDF文档')
16 | parser.add_argument('pdf_path', type=str, help='PDF文件路径')
17 | parser.add_argument('-o', '--original', default='auto', help='原始语言 (默认: auto)')
18 | parser.add_argument('-t', '--target', default='zh', help='目标语言 (默认: zh)')
19 | parser.add_argument('-b', '--begin', type=int, default=1, help='开始页码 (默认: 1)')
20 | parser.add_argument('-e', '--end', type=int, default=None, help='结束页码 (默认: 最后一页)')
21 | parser.add_argument('-c', '--config', type=str, default=None, help='配置文件路径')
22 | parser.add_argument('-d', '--dpi', type=int, default=72, help='OCR模式的DPI (默认: 72)')
23 |
24 | args = parser.parse_args()
25 |
26 | # Check that the PDF file exists
27 | print('Path:', args.pdf_path)
28 |
29 | if not os.path.exists(args.pdf_path):
30 | print(f"错误: 找不到文件 '{args.pdf_path}'")
31 | sys.exit(1)
32 |
33 | try:
34 | # Run the main function
35 | translator = main_function(
36 | pdf_path=args.pdf_path,
37 | original_language=args.original,
38 | target_language=args.target,
39 | bn=args.begin,
40 | en=args.end,
41 | config_path=args.config,
42 | DPI=args.dpi
43 | )
44 | translator.main()
45 | print(f"翻译完成! 输出文件保存在 target 目录")
46 | except Exception as e:
47 | print(f"翻译过程中发生错误: {e}")
48 | sys.exit(1)
49 |
50 |
51 | if __name__ == '__main__':
52 | main()
53 |
--------------------------------------------------------------------------------
/EbookTranslator/EbookTranslator/convert2pdf.py:
--------------------------------------------------------------------------------
1 | import fitz
2 | import os
3 |
4 |
5 | def convert_to_pdf(input_file, output_file=None):
6 | """
7 | 将支持的文档格式转换为 PDF,支持跨平台路径处理
8 |
9 | Args:
10 | input_file (str): 输入文件的完整路径
11 | output_file (str, optional): 输出PDF文件的完整路径。如果为None,则使用输入文件名+.pdf
12 |
13 | Returns:
14 | bool: 转换是否成功
15 | """
16 | try:
17 | # Normalize the path to handle platform-specific separators
18 | input_file = os.path.normpath(input_file)
19 |
20 | if not os.path.exists(input_file):
21 | print(f"错误:输入文件 '{input_file}' 不存在")
22 | return False
23 |
24 | # If no output file was specified, derive the output path from the input file
25 | if output_file is None:
26 | # Get the file name and directory
27 | file_dir = os.path.dirname(input_file)
28 | file_name = os.path.basename(input_file)
29 | name_without_ext = os.path.splitext(file_name)[0]
30 |
31 | # Create a PDF with the same name in the same directory
32 | output_file = os.path.join(file_dir, f"{name_without_ext}.pdf")
33 |
34 | # Make sure the output directory exists
35 | output_dir = os.path.dirname(output_file)
36 | if output_dir and not os.path.exists(output_dir):
37 | os.makedirs(output_dir, exist_ok=True)
38 |
39 | print(f"正在处理文件: {input_file}")
40 | print(f"输出文件将保存为: {output_file}")
41 |
42 | # 1. 先用 fitz.open 打开文档(EPUB、XPS、FB2 等格式)
43 | doc = fitz.open(input_file)
44 | print(f"文档页数: {len(doc)}")
45 |
46 | # 2. 调用 convert_to_pdf() 得到 PDF 格式字节流
47 | pdf_bytes = doc.convert_to_pdf()
48 |
49 | # 3. 再以 "pdf" 格式打开这段字节流
50 | pdf_doc = fitz.open("pdf", pdf_bytes)
51 |
52 | # 4. 保存为真正的 PDF 文件
53 | pdf_doc.save(output_file)
54 |
55 | # 关闭文档
56 | pdf_doc.close()
57 | doc.close()
58 |
59 | # 检查输出文件是否成功创建
60 | if os.path.exists(output_file):
61 | print(f"转换成功!PDF文件已保存为: {output_file}")
62 | return True
63 | else:
64 | print("转换似乎完成,但输出文件未找到")
65 | return False
66 |
67 | except fitz.FileDataError as e:
68 | print(f"Bad file format or corrupted file: {str(e)}")
69 | except PermissionError as e:
70 | print(f"Permission error: cannot access or write the file - {str(e)}")
71 | except Exception as e:
72 | print(f"Conversion failed, error type: {type(e).__name__}")
73 | print(f"Error details: {str(e)}")
74 | # Print the full stack trace in debug mode
75 | import traceback
76 | traceback.print_exc()
77 |
78 | return False
79 | # Usage example
80 | if __name__ == "__main__":
81 | # Single-file conversion example
82 | input_file = "666 (1).epub"
83 |
84 | # Validate the file extension
85 | if not input_file.lower().endswith(('.xps', '.epub', '.fb2', '.cbz', '.mobi')):
86 | print(f"不支持的文件格式。支持的格式包括: XPS, EPUB, FB2, CBZ, MOBI")
87 | else:
88 | convert_to_pdf(input_file)
89 |
90 | # Batch conversion example
91 | # input_directory = "documents"
92 | # batch_convert_to_pdf(input_directory)
93 |
--------------------------------------------------------------------------------
/EbookTranslator/EbookTranslator/load_config.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import requests
4 | from pathlib import Path
5 | from typing import Optional, Dict
6 |
7 |
8 | def get_working_dir() -> Path:
9 | """
10 | 获取工作目录
11 | 返回当前工作目录(即命令行执行目录或调用脚本所在目录)
12 | """
13 | return Path.cwd()
14 |
15 |
16 | # Define the application data directory
17 | WORKING_DIR = get_working_dir()
18 | APP_DATA_DIR = WORKING_DIR # here APP_DATA_DIR is simply the working directory
19 | print(f"Working directory: {WORKING_DIR}")
20 |
21 |
22 | def resolve_path(path: str) -> Path:
23 | """
24 | Resolve a path; supports absolute paths, relative paths and bare file names.
25 |
26 | Args:
27 | path (str): the input path, which may be absolute, relative or just a file name.
28 |
29 | Returns:
30 | Path: the resolved full path.
31 | """
32 | # If path is absolute, return it as-is
33 | if Path(path).is_absolute():
34 | return Path(path)
35 |
36 | # Otherwise join the relative path or file name with APP_DATA_DIR
37 | return APP_DATA_DIR / path
38 |
39 |
40 | def load_config(config_path: Optional[str] = None) -> Optional[Dict]:
41 | """
42 | Load the main configuration file, preferring the path passed in as config_path.
43 | If no path is given or the path is invalid, fall back to the file in APP_DATA_DIR.
44 | If APP_DATA_DIR has no config.json either, download it from the specified URL.
45 |
46 | Args:
47 | config_path (Optional[str]): configuration file path; may be absolute, relative or a file name.
48 |
49 | Returns:
50 | Dict: the configuration data, or None if loading failed.
51 | """
52 | try:
53 | # If a config_path argument was passed, prefer it
54 | if config_path:
55 | config_path = resolve_path(config_path) # resolve the path
56 | if config_path.exists():
57 | with config_path.open("r", encoding="utf-8") as f:
58 | return json.load(f)
59 | else:
60 | print(f"Specified config path does not exist: {config_path}")
61 |
62 | # If no config_path was given or it was invalid, use config.json from APP_DATA_DIR
63 | app_config_path = APP_DATA_DIR / "config.json"
64 | if app_config_path.exists():
65 | with app_config_path.open("r", encoding="utf-8") as f:
66 | return json.load(f)
67 | else:
68 | # If APP_DATA_DIR has none either, try downloading config.json from the URL below
69 | url = "https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/refs/heads/main/config.json"
70 | response = requests.get(url, timeout=20)
71 | if response.status_code == 200:
72 | # Save the downloaded content to APP_DATA_DIR
73 | APP_DATA_DIR.mkdir(parents=True, exist_ok=True) # make sure APP_DATA_DIR exists
74 | print(
75 | f"config.json file not found, downloading config.json from: {url}"
76 | )
77 | with app_config_path.open("w", encoding="utf-8") as f:
78 | f.write(response.text)
79 | return response.json()
80 | else:
81 | print(f"Failed to download config.json, HTTP status code: {response.status_code}")
82 | return None
83 | except Exception as e:
84 | print(f"Error loading config: {str(e)}")
85 | return None
86 |
87 |
88 | def get_file_path(filename: str) -> Path:
89 | """
90 | Get the full path of a configuration file, preferring the copy in APP_DATA_DIR.
91 |
92 | Args:
93 | filename (str): configuration file name.
94 |
95 | Returns:
96 | Path: full path of the configuration file.
97 | """
98 | # First check whether the file exists in APP_DATA_DIR
99 | app_data_file = APP_DATA_DIR / filename
100 | if app_data_file.exists():
101 | return app_data_file
102 |
103 | # Otherwise fall back to the copy next to this script
104 | return Path(__file__).parent / filename
105 |
--------------------------------------------------------------------------------
/EbookTranslator/README.md:
--------------------------------------------------------------------------------
1 | English | [简体中文](https://github.com/CBIhalsen/PolyglotPDF/blob/main//README_CN.md) | [繁體中文](https://github.com/CBIhalsen/PolyglotPDF/blob/main/README_TW.md) | [日本語](https://github.com/CBIhalsen/PolyglotPDF/blob/main/README_JA.md) | [한국어](https://github.com/CBIhalsen/PolyglotPDF/blob/main/README_KO.md)
2 | # PolyglotPDF
3 |
4 | [](https://www.python.org/)
5 | [](https://example.com)
6 | [](https://www.latex-project.org/)
7 | [](https://example.com)
8 | [](https://example.com)
9 | [](https://pymupdf.readthedocs.io/)
10 |
11 |
12 | ## Demo
13 |
14 |
15 | ## Speed comparison
16 |
17 |
18 | ### [🎬 Watch Full Video](https://github.com/CBIhalsen/PolyglotPDF/blob/main/demo.mp4)
19 | LLMs have been added as the preferred translation API; Doubao, Qwen, DeepSeek V3, and GPT-4o-mini are recommended. The color-space error can be resolved by filling the white areas in PDF files. The old text-to-text translation API has been removed.
20 |
21 | In addition, an arXiv search function and rendering of arXiv papers after LaTeX translation are being considered.
22 |
23 | ### Page previews
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 | # Chinese LLM API Application
35 |
36 | ## Doubao & Deepseek
37 | Apply through Volcengine platform:
38 | - Application URL: [Volcengine-Doubao](https://www.volcengine.com/product/doubao/)
39 | - Available Models: Doubao, Deepseek series models
40 |
41 | ## Tongyi Qwen
42 | Apply through Alibaba Cloud platform:
43 | - Application URL: [Alibaba Cloud-Tongyi Qwen](https://cn.aliyun.com/product/tongyi?from_alibabacloud=&utm_content=se_1019997984)
44 | - Available Models: Qwen-Max, Qwen-Plus series models
45 |
46 |
47 | ## Overview
48 | PolyglotPDF (EbookTranslator) is an advanced PDF processing tool that employs specialized techniques for ultra-fast text, table, and formula recognition in PDF documents, typically completing processing within 1 second. It features OCR capabilities and layout-preserving translation, with full document translations usually completed within 10 seconds (speed may vary depending on the translation API provider).
49 |
50 | ## Features
51 | - **Ultra-Fast Recognition**: Processes text, tables, and formulas in PDFs within ~1 second
52 | - **Layout-Preserving Translation**: Maintains original document formatting while translating content
53 | - **OCR Support**: Handles scanned documents efficiently
54 | - **Text-based PDFs**: No GPU required
55 | - **Quick Translation**: Complete PDF translation in approximately 10 seconds
56 | - **Flexible API Integration**: Compatible with various translation service providers
57 | - **Web-based Comparison Interface**: Side-by-side comparison of original and translated documents
58 | - **Enhanced OCR Capabilities**: Improved accuracy in text recognition and processing
59 | - **Support for offline translation**: Use smaller translation model
60 |
61 | ## Installation and Setup
62 |
63 |
64 |
65 | ### There are several ways to use it. One is to install the library:
66 |
67 | ```bash
68 | pip install EbookTranslator
69 | ```
70 |
71 |
72 |
73 | Basic usage:
74 |
75 | ```bash
76 | EbookTranslator your_file.pdf
77 | ```
78 |
79 | Usage with parameters:
80 |
81 | ```bash
82 | EbookTranslator your_file.pdf -o en -t zh -b 1 -e 10 -c /path/to/config.json -d 300
83 | ```
84 |
85 | #### Using in Python Code
86 |
87 | ```python
88 | from EbookTranslator import main_function
89 |
90 | translator = main_function(
91 | pdf_path="your_file.pdf",
92 | original_language="en",
93 | target_language="zh",
94 | bn=1,
95 | en=10,
96 | config_path="/path/to/config.json",
97 | DPI=300
98 | )
99 | translator.main()
100 | ```
101 |
102 | ## Parameter Description
103 |
104 | | Parameter | Command Line Option | Description | Default Value |
105 | |-----------|---------------------|-------------|---------------|
106 | | `pdf_path` | Positional argument | PDF file path | Required |
107 | | `original_language` | `-o, --original` | Source language | `auto` |
108 | | `target_language` | `-t, --target` | Target language | `zh` |
109 | | `bn` | `-b, --begin` | Starting page number | `1` |
110 | | `en` | `-e, --end` | Ending page number | Last page of the document |
111 | | `config_path` | `-c, --config` | Configuration file path | `config.json` in the current working directory |
112 | | `DPI` | `-d, --dpi` | DPI for OCR mode | `72` |
113 |
114 | #### Configuration File
115 |
116 | The configuration file is a JSON file, by default located at `config.json` in the current working directory. If it doesn't exist, the program will use built-in default settings.
117 |
118 | #### Configuration File Example
119 |
120 | ```json
121 | {
122 | "count": 4,
123 | "PPC": 20,
124 | "translation_services": {
125 | "Doubao": {
126 | "auth_key": "",
127 | "model_name": ""
128 | },
129 | "Qwen": {
130 | "auth_key": "",
131 | "model_name": "qwen-plus"
132 | },
133 | "deepl": {
134 | "auth_key": ""
135 | },
136 | "deepseek": {
137 | "auth_key": "",
138 | "model_name": "ep-20250218224909-gps4n"
139 | },
140 | "openai": {
141 | "auth_key": "",
142 | "model_name": "gpt-4o-mini"
143 | },
144 | "youdao": {
145 | "app_key": "",
146 | "app_secret": ""
147 | }
148 | },
149 | "ocr_services": {
150 | "tesseract": {
151 | "path": "C:\\Program Files\\Tesseract-OCR\\tesseract.exe"
152 | }
153 | },
154 | "default_services": {
155 | "ocr_model": false,
156 | "line_model": false,
157 | "Enable_translation": true,
158 | "Translation_api": "openai"
159 | }
160 | }
161 | ```
162 |
163 | #### Configuration Options
164 |
165 | - `translation_service`: Translation service provider (e.g., "google", "deepl", "baidu")
166 | - `api_key`: Translation API key (if required)
167 | - `translation_mode`: Translation mode, "online" or "offline"
168 | - `ocr_enabled`: Whether to enable OCR recognition
169 | - `tesseract_path`: Path to Tesseract OCR engine (if not in system PATH)
170 | - `output_dir`: Output directory
171 | - `language_codes`: Language code mapping
172 | - `font_mapping`: Fonts corresponding to different languages
173 |
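For reference, these settings are read at runtime through the bundled `load_config` module; the sketch below is only illustrative and uses the key names from the example configuration above (it assumes the package is installed and a `config.json` is reachable from the working directory):

```python
from EbookTranslator import load_config

# load_config() looks for config.json in the working directory and falls back to
# downloading a template from the PolyglotPDF repository if none is found
config = load_config.load_config()

active_api = config["default_services"]["Translation_api"]        # e.g. "openai"
openai_key = config["translation_services"]["openai"]["auth_key"]
print(f"Active translation API: {active_api}, key configured: {bool(openai_key)}")
```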
174 |
175 | #### Output
176 |
177 | Translated PDF files will be saved in the directory specified by `output_dir` (default is the `target` folder in the current working directory).
178 |
179 |
180 |
181 |
182 | ## License
183 |
184 | MIT
185 |
186 | ## How to use the friendly web UI
187 |
188 | 1. Clone the repository:
189 | ```bash
190 | git clone https://github.com/CBIhalsen/PolyglotPDF.git
191 | cd polyglotpdf
192 | ```
193 |
194 | 2. Install required packages:
195 | ```bash
196 | pip install -r requirements.txt
197 | ```
198 | 3. Configure your API key in config.json. The alicloud translation API is not recommended.
199 |
200 | 4. Run the application:
201 | ```bash
202 | python app.py
203 | ```
204 |
205 | 5. Access the web interface:
206 | Open your browser and navigate to `http://127.0.0.1:8000`
207 |
208 | ## Requirements
209 | - Python 3.8+
210 | - deepl==1.17.0
211 | - Flask==2.0.1
212 | - Flask-Cors==5.0.0
213 | - langdetect==1.0.9
214 | - Pillow==10.2.0
215 | - PyMuPDF==1.24.0
216 | - pytesseract==0.3.10
217 | - requests==2.31.0
218 | - tiktoken==0.6.0
219 | - Werkzeug==2.0.1
220 |
221 | ## Acknowledgments
222 | This project leverages PyMuPDF's capabilities for efficient PDF processing and layout preservation.
223 |
224 | ## Upcoming Improvements
225 | - PDF chat functionality
226 | - Academic PDF search integration
227 | - Optimization for even faster processing speeds
228 |
229 | ### Known Issues
230 | - **Issue Description**: Error during text re-editing: `code=4: only Gray, RGB, and CMYK colorspaces supported`
231 | - **Symptom**: Unsupported color space encountered during text block editing
232 | - **Current Workaround**: Skip text blocks with unsupported color spaces (see the sketch after this list)
233 | - **Proposed Solution**: Switch to OCR mode for entire pages containing unsupported color spaces
234 | - **Example**: [View PDF sample with unsupported color spaces](https://github.com/CBIhalsen/PolyglotPDF/blob/main/static/colorspace_issue_sample.pdf)
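235 |
236 | A hedged illustration of the current workaround (the helper name and overall flow are assumptions, not the project's exact code): catch the colorspace error per block and skip that block instead of aborting the page.
237 |
238 | ```python
239 | import fitz  # PyMuPDF
240 |
241 | def edit_block_or_skip(page, block, new_text, css):
242 |     """Re-edit one text block; skip it if PyMuPDF reports an unsupported colorspace."""
243 |     rect = fitz.Rect(block["bbox"])
244 |     try:
245 |         page.add_redact_annot(rect)      # mark the original text for removal
246 |         page.apply_redactions()          # clear it, keeping the page layout
247 |         page.insert_htmlbox(rect, new_text, css=css)
248 |     except Exception as err:
249 |         if "colorspaces supported" in str(err):  # code=4: only Gray, RGB, CMYK
250 |             print(f"Skipping block with unsupported colorspace at {rect}")
251 |         else:
252 |             raise
253 | ```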
235 |
236 |
237 | ### Font Optimization
238 | Current font configuration in the `start` function of `main.py`:
239 | ```python
240 | # Current configuration
241 | css=f"* {{font-family:{get_font_by_language(self.target_language)};font-size:auto;color: #111111 ;font-weight:normal;}}"
242 | ```
243 |
244 | You can optimize font display through the following methods:
245 |
246 | 1. **Modify Default Font Configuration**
247 | ```python
248 | # Custom font styles
249 | css=f"""* {{
250 | font-family: {get_font_by_language(self.target_language)};
251 | font-size: auto;
252 | color: #111111;
253 | font-weight: normal;
254 |     letter-spacing: 0.5px; /* adjust letter spacing */
255 |     line-height: 1.5; /* adjust line height */
256 | }}"""
257 | ```
258 |
259 | 2. **Embed Custom Fonts**
260 | You can embed custom fonts by following these steps:
261 | - Place font files (.ttf, .otf) in the project's `fonts` directory
262 | - Use `@font-face` to declare custom fonts in CSS
263 | ```python
264 | css=f"""
265 | @font-face {{
266 | font-family: 'CustomFont';
267 | src: url('fonts/your-font.ttf') format('truetype');
268 | }}
269 | * {{
270 | font-family: 'CustomFont', {get_font_by_language(self.target_language)};
271 | font-size: auto;
272 | font-weight: normal;
273 | }}
274 | """
275 | ```
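276 |
277 | For reference, a hypothetical sketch of a language-to-font lookup in the spirit of the `get_font_by_language` helper used above (the actual mapping in `main.py` may differ, and the font names here are illustrative only):
278 |
279 | ```python
280 | def get_font_by_language(target_language: str) -> str:
281 |     """Return a CSS font-family name for the given target language code (illustrative values)."""
282 |     fonts = {
283 |         "zh": "SimSun",
284 |         "ja": "Noto Sans JP",
285 |         "ko": "Noto Sans KR",
286 |     }
287 |     return fonts.get(target_language, "sans-serif")  # generic CSS fallback
288 | ```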
276 |
277 | ### Basic Principles
278 | This project follows similar basic principles as Adobe Acrobat DC's PDF editing, using PyMuPDF for text block recognition and manipulation:
279 |
280 | - **Core Process**:
281 | ```python
282 | # Get text blocks from the page
283 | blocks = page.get_text("dict")["blocks"]
284 |
285 | # Process each text block
286 | for block in blocks:
287 | if block.get("type") == 0: # text block
288 | bbox = block["bbox"] # get text block boundary
289 | text = ""
290 | font_info = None
291 | # Collect text and font information
292 | for line in block["lines"]:
293 | for span in line["spans"]:
294 | text += span["text"] + " "
295 | ```
296 | This approach directly processes PDF text blocks, maintaining the original layout while achieving efficient text extraction and modification.
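297 |
298 | As a hedged, self-contained sketch of the step that follows (the file names, CSS string, and overall flow are assumptions rather than the project's exact code), the collected blocks can then be cleared and refilled in place with PyMuPDF's redaction and `insert_htmlbox` APIs:
299 |
300 | ```python
301 | import fitz  # PyMuPDF
302 |
303 | doc = fitz.open("your_file.pdf")
304 | page = doc[0]
305 |
306 | edits = []
307 | for block in page.get_text("dict")["blocks"]:
308 |     if block.get("type") == 0:                  # text block
309 |         rect = fitz.Rect(block["bbox"])
310 |         text = " ".join(span["text"]
311 |                         for line in block["lines"]
312 |                         for span in line["spans"])
313 |         edits.append((rect, text))              # translate `text` before re-inserting
314 |         page.add_redact_annot(rect)             # mark the original text for removal
315 |
316 | page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)  # clear text, leave images alone
317 | for rect, text in edits:                        # write the (translated) text back in place
318 |     page.insert_htmlbox(rect, text, css="* {font-size: auto;}")
319 |
320 | doc.save("your_file_translated.pdf")
321 | ```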
297 |
298 | - **Technical Choices**:
299 | - Utilizes PyMuPDF for PDF parsing and editing
300 | - Focuses on text processing
301 | - Avoids complex operations like AI formula recognition, table processing, or page restructuring
302 |
303 | - **Why Avoid Complex Processing**:
304 | - AI recognition of formulas, tables, and PDF restructuring faces severe performance bottlenecks
305 | - Complex AI processing leads to high computational costs
306 | - Significantly increased processing time (potentially tens of seconds or more)
307 | - Difficult to deploy at scale with low costs in production environments
308 | - Not suitable for online services requiring quick response times
309 |
310 | - **Project Scope**:
311 | - This project only serves to demonstrate the correct approach for layout-preserved PDF translation and AI-assisted PDF reading. Converting PDF files to markdown format for large language models to read, in my opinion, is not a wise approach.
312 | - Aims for optimal performance-to-cost ratio
313 |
314 | - **Performance**:
315 | - PolyglotPDF API response time: ~1 second per page
316 | - Low computational resource requirements, suitable for scale deployment
317 | - High cost-effectiveness for commercial applications
318 |
319 | - **Contact the author**:
320 |   - QQ: 1421243966
321 |   - Email: 1421243966@qq.com
322 |   - QQ group (questions and discussion): 1031477425
327 |
328 |
329 |
330 |
--------------------------------------------------------------------------------
/EbookTranslator/requirements.txt:
--------------------------------------------------------------------------------
1 | deepl==1.17.0
2 | Flask
3 | flask-cors
4 | Pillow==10.2.0
5 | PyMuPDF==1.24.0
6 | pytesseract==0.3.10
7 | requests==2.31.0
8 | Werkzeug==2.0.1
9 | aiohttp
10 |
--------------------------------------------------------------------------------
/EbookTranslator/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | with open("README.md", "r", encoding="utf-8") as fh:
4 | long_description = fh.read()
5 |
6 | setup(
7 | name="EbookTranslator",
8 | version="0.3.3",
9 | author="Chen",
10 | author_email="1421243966@qq.com",
11 | description="The world's highest performing e-book retention layout translation library",
12 |     long_description=long_description,  # use the README as the long description
13 | long_description_content_type="text/markdown",
14 |     url="https://github.com/1421243966/EbookTranslator",  # update this to your actual GitHub repository
15 | packages=find_packages(),
16 | classifiers=[
17 | "Programming Language :: Python :: 3",
18 | "Programming Language :: Python :: 3.6",
19 | "Programming Language :: Python :: 3.7",
20 | "Programming Language :: Python :: 3.8",
21 | "Programming Language :: Python :: 3.9",
22 | "Programming Language :: Python :: 3.10",
23 | "License :: OSI Approved :: MIT License",
24 | "Operating System :: OS Independent",
25 | "Development Status :: 4 - Beta",
26 | "Intended Audience :: Developers",
27 | "Intended Audience :: Education",
28 | "Intended Audience :: Science/Research",
29 | "Topic :: Text Processing :: Linguistic",
30 | "Topic :: Utilities",
31 | ],
32 | python_requires=">=3.6",
33 | install_requires=[
34 | "pymupdf>=1.18.0",
35 | "Pillow>=8.0.0",
36 | "pytesseract>=0.3.0",
37 | "deepl>=1.17.0",
38 | "requests>=2.25.0",
39 | "Werkzeug>=2.0.0",
40 | "aiohttp>=3.7.4",
41 | ],
42 | entry_points={
43 | "console_scripts": [
44 | "EbookTranslator=EbookTranslator.cli:main",
45 | ],
46 | },
47 | include_package_data=True,
48 | keywords=["ebook", "translation", "pdf", "ocr", "nlp", "language"],
49 | project_urls={
50 | "Bug Reports": "https://github.com/1421243966/EbookTranslator/issues",
51 | "Source": "https://github.com/1421243966/EbookTranslator",
52 | "Documentation": "https://github.com/1421243966/EbookTranslator#readme",
53 | },
54 | )
55 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | The Python package is not expected to be updated before version 2.2. Version 2.2 is expected to move to layout logic that parses the lowest-level spans to obtain more information, which should fix: inline formulas being misclassified as formula blocks, bold text being incorrectly split into separate segments, and the insert_html method re-embedding font files repeatedly, which wastes compute and becomes extremely sluggish on PDFs with many pages. As things stand, PolyglotPDF's parsing approach is still the best option for text-based PDFs; OCR and layout analysis are not always perfect. (Handling superscripts and subscripts is under consideration: most PDFs fake them by adjusting coordinates and font size, and replacing them with the true Unicode superscript/subscript characters is an option, though not a perfect one.) For report-style table documents, PolyglotPDF works very well, although complex vector math formulas inside tables still cannot be handled correctly.
2 | We are seeking feedback on an improvement for text with complex color layouts or bold mixed with regular type: flowing content can be parsed into HTML, for example:
3 |
4 | ABSTRACT:
5 |
6 | The swine industry annually suffers significant economic losses caused by porcine reproductive and respiratory syndrome virus (PRRSV). Because the available commercial vaccines have limited protective efficacy against epidemic PRRSV, there is an urgent need for innovative solutions. Nanoparticle vaccines induce robust immune responses and have become a promising direction in vaccine development. In this study, we designed and produced a self-assembling nanoparticle vaccine derived from thermophilic archaeal ferritin to combat epidemic PRRSV. First, multiple T cell epitopes targeting viral structural proteins were identified by IFN-γ screening after PRRSV infection. Three different self-assembled nanoparticles with epitopes targeting viral GP3, GP4, and GP5.
7 |
8 |
9 | Content parsed this way can only be translated by LLMs; the translation result looks like this:
10 | ```html
11 | 摘要:
12 |
13 | 猪产业每年因猪繁殖与呼吸综合征病毒(PRRSV)造成显著的经济损失。由于现有的商业疫苗对流行性PRRSV的保护效果有限,迫切需要创新的解决方案。纳米粒子疫苗能够引发强烈的免疫反应,已成为疫苗开发的一个有前景的方向。在本研究中,我们设计并生产了一种源自嗜热古细菌铁蛋白的自组装纳米粒子疫苗,以对抗流行性PRRSV。首先,通过PRRSV感染后的IFN-γ筛选,识别出针对病毒结构蛋白的多个T细胞表位。三种不同的自组装纳米粒子携带针对病毒GP3、GP4和GP5的表位。
14 |
15 | ```
16 | It can even preserve bold:
17 | ```html
18 | 摘要:
19 |
20 | 猪产业每年因猪繁殖与呼吸综合征病毒(PRRSV)造成显著的经济损失。由于现有的商业疫苗对流行性PRRSV的保护效果有限,迫切需要创新的解决方案。纳米粒子疫苗能够引发强烈的免疫反应,已成为疫苗开发的一个有前景的方向。在本研究中,我们设计并生产了一种源自嗜热古细菌铁蛋白的自组装纳米粒子疫苗,以对抗流行性PRRSV。首先,通过PRRSV感染后的IFN-γ筛选,识别出针对病毒结构蛋白的多个T细胞表位。三种不同的自组装纳米粒子携带针对病毒GP3、GP4和GP5的表位。
21 |
22 | ```
23 | This approach comes very close to perfect handling; we are currently considering offering it as an optional enhanced feature (a sketch of the idea follows).
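24 |
25 | A minimal sketch of the span-to-HTML step, assuming PyMuPDF's `get_text("dict")` block structure (the helper name is illustrative; bit 16 of a span's `flags` marks bold):
26 |
27 | ```python
28 | def block_to_html(block) -> str:
29 |     """Serialize a text block's spans to inline HTML, wrapping bold runs in <b> tags."""
30 |     parts = []
31 |     for line in block["lines"]:
32 |         for span in line["spans"]:
33 |             text = span["text"]
34 |             parts.append(f"<b>{text}</b>" if span["flags"] & 16 else text)
35 |     return " ".join(parts)
36 | ```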
24 |
25 | English | [简体中文](/README_CN.md) | [繁體中文](README_TW.md) | [日本語](README_JA.md) | [한국어](README_KO.md)
26 | # PolyglotPDF
27 |
28 | [](https://www.python.org/)
29 | [](https://example.com)
30 | [](https://www.latex-project.org/)
31 | [](https://example.com)
32 | [](https://example.com)
33 | [](https://pymupdf.readthedocs.io/)
34 |
35 |
36 | ## Demo
37 |
38 |
39 | ### [🎬 Watch Full Video](https://github.com/CBIhalsen/PolyglotPDF/blob/main/demo.mp4)
40 | LLMs have been added as translation API options; Doubao, Qwen, DeepSeek V3, and gpt-4o-mini are recommended. The color space error can be resolved by filling the white areas in PDF files. The old text-to-text translation APIs have been removed.
41 |
42 | In addition, we are considering adding an arXiv search function and rendering arXiv papers after LaTeX translation.
43 |
44 | ### Pages show
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 | # Chinese LLM API Application
56 |
57 | ## Doubao & Deepseek
58 | Apply through Volcengine platform:
59 | - Application URL: [Volcengine-Doubao](https://www.volcengine.com/product/doubao/)
60 | - Available Models: Doubao, Deepseek series models
61 |
62 | ## Tongyi Qwen
63 | Apply through Alibaba Cloud platform:
64 | - Application URL: [Alibaba Cloud-Tongyi Qwen](https://cn.aliyun.com/product/tongyi?from_alibabacloud=&utm_content=se_1019997984)
65 | - Available Models: Qwen-Max, Qwen-Plus series models
66 |
67 |
68 | ## Overview
69 | PolyglotPDF is an advanced PDF processing tool that employs specialized techniques for ultra-fast text, table, and formula recognition in PDF documents, typically completing processing within 1 second. It features OCR capabilities and layout-preserving translation, with full document translations usually completed within 10 seconds (speed may vary depending on the translation API provider).
70 |
71 |
72 | ## Features
73 | - **Ultra-Fast Recognition**: Processes text, tables, and formulas in PDFs within ~1 second
74 | - **Layout-Preserving Translation**: Maintains original document formatting while translating content
75 | - **OCR Support**: Handles scanned documents efficiently
76 | - **Text-based PDFs**: No GPU required
77 | - **Quick Translation**: Complete PDF translation in approximately 10 seconds
78 | - **Flexible API Integration**: Compatible with various translation service providers
79 | - **Web-based Comparison Interface**: Side-by-side comparison of original and translated documents
80 | - **Enhanced OCR Capabilities**: Improved accuracy in text recognition and processing
81 | - **Offline translation support**: Uses a smaller translation model
82 |
83 | ## Installation and Usage
84 |
85 |
86 | Standard Installation
87 |
88 | 1. Clone the repository:
89 | ```bash
90 | git clone https://github.com/CBIhalsen/PolyglotPDF.git
91 | cd polyglotpdf
92 | ```
93 |
94 | 2. Install required packages:
95 | ```bash
96 | pip install -r requirements.txt
97 | ```
98 | 3. Configure your API keys in `config.json`. The alicloud (Alibaba Cloud) translation API is not recommended.
99 |
100 | 4. Run the application:
101 | ```bash
102 | python app.py
103 | ```
104 |
105 | 5. Access the web interface:
106 | Open your browser and navigate to `http://127.0.0.1:8000`
107 |
108 |
109 |
110 | Docker Installation
111 |
112 | ## Quick Start Without Persistence
113 |
114 | If you want to quickly test PolyglotPDF without setting up persistent directories:
115 |
116 | ```bash
117 | # Pull the image first
118 | docker pull 2207397265/polyglotpdf:latest
119 |
120 | # Run container without mounting volumes (data will be lost when container is removed)
121 | docker run -d -p 12226:12226 --name polyglotpdf 2207397265/polyglotpdf:latest
122 | ```
123 |
124 | This is the fastest way to try PolyglotPDF, but all uploaded PDFs and configuration changes will be lost when the container is removed.
125 |
126 | ## Installation with Persistent Storage
127 |
128 | ```bash
129 | # Create necessary directories
130 | mkdir -p config fonts static/original static/target static/merged_pdf
131 |
132 | # Create config file
133 | nano config/config.json # or use any text editor
134 | # Copy configuration template from the project into this file
135 | # Make sure to fill in your API keys and other configuration details
136 |
137 | # Set permissions
138 | chmod -R 755 config fonts static
139 | ```
140 |
141 | ### Quick Start
142 |
143 | Use the following commands to pull and run the PolyglotPDF Docker image:
144 |
145 | ```bash
146 | # Pull image
147 | docker pull 2207397265/polyglotpdf:latest
148 |
149 | # Run container
150 | docker run -d -p 12226:12226 --name polyglotpdf \
151 | -v ./config/config.json:/app/config.json \
152 | -v ./fonts:/app/fonts \
153 | -v ./static/original:/app/static/original \
154 | -v ./static/target:/app/static/target \
155 | -v ./static/merged_pdf:/app/static/merged_pdf \
156 | 2207397265/polyglotpdf:latest
157 | ```
158 |
159 | ### Access the Application
160 |
161 | After the container starts, open in your browser:
162 | ```
163 | http://localhost:12226
164 | ```
165 |
166 | ### Using Docker Compose
167 |
168 | Create a `docker-compose.yml` file:
169 |
170 | ```yaml
171 | version: '3'
172 | services:
173 | polyglotpdf:
174 | image: 2207397265/polyglotpdf:latest
175 | ports:
176 | - "12226:12226"
177 | volumes:
178 | - ./config.json:/app/config.json # Configuration file
179 | - ./fonts:/app/fonts # Font files
180 | - ./static/original:/app/static/original # Original PDFs
181 | - ./static/target:/app/static/target # Translated PDFs
182 | - ./static/merged_pdf:/app/static/merged_pdf # Merged PDFs
183 | restart: unless-stopped
184 | ```
185 |
186 | Then run:
187 |
188 | ```bash
189 | docker-compose up -d
190 | ```
191 |
192 | ### Common Docker Commands
193 |
194 | ```bash
195 | # Stop container
196 | docker stop polyglotpdf
197 |
198 | # Restart container
199 | docker restart polyglotpdf
200 |
201 | # View logs
202 | docker logs polyglotpdf
203 | ```
204 |
205 |
206 | ## Requirements
207 | - Python 3.8+
208 | - deepl==1.17.0
209 | - Flask==2.0.1
210 | - Flask-Cors==5.0.0
211 | - langdetect==1.0.9
212 | - Pillow==10.2.0
213 | - PyMuPDF==1.24.0
214 | - pytesseract==0.3.10
215 | - requests==2.31.0
216 | - tiktoken==0.6.0
217 | - Werkzeug==2.0.1
218 |
219 | ## Acknowledgments
220 | This project leverages PyMuPDF's capabilities for efficient PDF processing and layout preservation.
221 |
222 | ## Upcoming Improvements
223 | - PDF chat functionality
224 | - Academic PDF search integration
225 | - Optimization for even faster processing speeds
226 |
227 | ### Known Issues
228 | - **Issue Description**: Error during text re-editing: `code=4: only Gray, RGB, and CMYK colorspaces supported`
229 | - **Symptom**: Unsupported color space encountered during text block editing
230 | - **Current Workaround**: Skip text blocks with unsupported color spaces
231 | - **Proposed Solution**: Switch to OCR mode for entire pages containing unsupported color spaces
232 | - **Example**: [View PDF sample with unsupported color spaces](https://github.com/CBIhalsen/PolyglotPDF/blob/main/static/colorspace_issue_sample.pdf)
233 |
234 | ### TODO
235 | - □ **Custom Terminology Database**: Support custom terminology databases with prompts for domain-specific professional translation
236 | - □ **AI Reflow Feature**: Convert double-column PDFs to single-column HTML blog format for easier reading on mobile devices
237 | - □ **Multi-format Export**: Export translation results to PDF, HTML, Markdown and other formats
238 | - □ **Multi-device Synchronization**: Read translations on mobile after processing on desktop
239 | - □ **Enhanced Merge Logic**: Improve the current merge logic by disabling font name detection and enabling horizontal, vertical, x, y range overlap merging
240 |
241 | ### Font Optimization
242 | Current font configuration in the `start` function of `main.py`:
243 | ```python
244 | # Current configuration
245 | css=f"* {{font-family:{get_font_by_language(self.target_language)};font-size:auto;color: #111111 ;font-weight:normal;}}"
246 | ```
247 |
248 | You can optimize font display through the following methods:
249 |
250 | 1. **Modify Default Font Configuration**
251 | ```python
252 | # Custom font styles
253 | css=f"""* {{
254 | font-family: {get_font_by_language(self.target_language)};
255 | font-size: auto;
256 | color: #111111;
257 | font-weight: normal;
258 |     letter-spacing: 0.5px; /* adjust letter spacing */
259 |     line-height: 1.5; /* adjust line height */
260 | }}"""
261 | ```
262 |
263 | 2. **Embed Custom Fonts**
264 | You can embed custom fonts by following these steps:
265 | - Place font files (.ttf, .otf) in the project's `fonts` directory
266 | - Use `@font-face` to declare custom fonts in CSS
267 | ```python
268 | css=f"""
269 | @font-face {{
270 | font-family: 'CustomFont';
271 | src: url('fonts/your-font.ttf') format('truetype');
272 | }}
273 | * {{
274 | font-family: 'CustomFont', {get_font_by_language(self.target_language)};
275 | font-size: auto;
276 | font-weight: normal;
277 | }}
278 | """
279 | ```
280 |
281 | ### Basic Principles
282 | This project follows similar basic principles as Adobe Acrobat DC's PDF editing, using PyMuPDF for text block recognition and manipulation:
283 |
284 | - **Core Process**:
285 | ```python
286 | # Get text blocks from the page
287 | blocks = page.get_text("dict")["blocks"]
288 |
289 | # Process each text block
290 | for block in blocks:
291 | if block.get("type") == 0: # text block
292 | bbox = block["bbox"] # get text block boundary
293 | text = ""
294 | font_info = None
295 | # Collect text and font information
296 | for line in block["lines"]:
297 | for span in line["spans"]:
298 | text += span["text"] + " "
299 | ```
300 | This approach directly processes PDF text blocks, maintaining the original layout while achieving efficient text extraction and modification.
301 |
302 | - **Technical Choices**:
303 | - Utilizes PyMuPDF for PDF parsing and editing
304 | - Focuses on text processing
305 | - Avoids complex operations like AI formula recognition, table processing, or page restructuring
306 |
307 | - **Why Avoid Complex Processing**:
308 | - AI recognition of formulas, tables, and PDF restructuring faces severe performance bottlenecks
309 | - Complex AI processing leads to high computational costs
310 | - Significantly increased processing time (potentially tens of seconds or more)
311 | - Difficult to deploy at scale with low costs in production environments
312 | - Not suitable for online services requiring quick response times
313 |
314 | - **Project Scope**:
315 | - This project only serves to demonstrate the correct approach for layout-preserved PDF translation and AI-assisted PDF reading. Converting PDF files to markdown format for large language models to read, in my opinion, is not a wise approach.
316 | - Aims for optimal performance-to-cost ratio
317 |
318 | - **Performance**:
319 | - PolyglotPDF API response time: ~1 second per page
320 | - Low computational resource requirements, suitable for scale deployment
321 | - High cost-effectiveness for commercial applications
322 |
323 | - **Contact the author**:
324 |   - QQ: 1421243966
325 |   - Email: 1421243966@qq.com
326 |   - QQ group (questions and discussion): 1031477425
331 |
332 |
--------------------------------------------------------------------------------
/README_CN.md:
--------------------------------------------------------------------------------
1 | 注: 对于pdf这种棘手的文件处理,对于文字版pdf的最优解:参考开源项目mupdf重构block识别算法只需要达到Adobe Acrobat Dc精度即可,不要舍近求远使用ocr扫描文字版pdf。 使用ai模型去理解pdf布局未来成本绝对会高于使用gpt4o mini这类价格! 对于pdf种公式识别出要么不处理,要么通过字体文件名称和对应unicode值进行映射。 ocr扫描文字版pdf相当愚蠢
2 | # PolyglotPDF
3 |
4 | [](https://www.python.org/)
5 | [](https://example.com)
6 | [](https://www.latex-project.org/)
7 | [](https://example.com)
8 | [](https://example.com)
9 | [](https://pymupdf.readthedocs.io/)
10 |
11 | ## Demo
12 |
13 |
14 | ### [🎬 Watch Full Video](https://github.com/CBIhalsen/PolyglotPDF/blob/main/demo.mp4)
15 | 已经加入llms作为翻译api的选择,建议选择:Doubao ,Qwen ,deepseek v3 ,gpt4-o-mini。色彩空间错误可以通过填充PDF文件中的白色区域来解决。 古老text to text翻译api已删除
16 |
17 | 另外,考虑添加arxiv搜索功能及对arxiv论文进行latex翻译后渲染。
18 |
19 | ### 页面展示
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 | # 国内大语言模型API申请
30 |
31 | ## Doubao & Deepseek
32 | 通过火山引擎平台申请:
33 | - 申请地址: [火山引擎-豆包](https://www.volcengine.com/product/doubao/)
34 | - 支持模型: 豆包(Doubao)、Deepseek系列模型
35 |
36 | ## 通义千问(Qwen)
37 | 通过阿里云平台申请:
38 | - 申请地址: [阿里云-通义千问](https://cn.aliyun.com/product/tongyi?from_alibabacloud=&utm_content=se_1019997984)
39 | - 支持模型: Qwen-Max、Qwen-Plus等系列模型
40 |
41 |
42 | ## 概述
43 | PolyglotPDF 是一款先进的 PDF 处理工具,采用特殊技术实现对 PDF 文档中的文字、表格和公式的超快速识别,通常仅需 1 秒即可完成处理。它支持 OCR 功能和完美保留版面的翻译功能,整篇文档的翻译通常可在 10 秒内完成(具体速度取决于翻译 API 服务商)。
44 |
45 | ## 主要特点
46 | - **超快识别**:在约 1 秒内完成对 PDF 中文字、表格和公式的处理
47 | - **保留版面翻译**:翻译过程中完整保持原文档的排版格式
48 | - **OCR 支持**:高效处理扫描版文档
49 | - **基于文本的 PDF**:不需要GPU
50 | - **快速翻译**:约 10 秒内完成整个 PDF 的翻译
51 | - **灵活的 API 集成**:可对接各种翻译服务提供商
52 | - **网页对比界面**:支持原文与译文的并排对比
53 | - **增强的 OCR 功能**:提供更准确的文本识别和处理能力
54 | - **支持离线翻译**:使用较小翻译模型
55 |
56 | ## 安装和设置
57 |
58 |
59 | 标准安装
60 |
61 | 1. 克隆仓库:
62 | ```bash
63 | git clone https://github.com/CBIhalsen/Polyglotpdf.git
64 | cd polyglotpdf
65 | ```
66 |
67 | 2. 安装依赖包:
68 | ```bash
69 | pip install -r requirements.txt
70 | ```
71 | 3. 在config.json内配置API密钥,不建议使用alicloud翻译API.
72 |
73 | 4. 运行应用:
74 | ```bash
75 | python app.py
76 | ```
77 |
78 | 5. 访问网页界面:
79 | 在浏览器中打开 `http://127.0.0.1:8000`
80 |
81 |
82 |
83 | Docker 安装
84 |
85 | ## 无持久化快速启动
86 |
87 | 如果您想快速测试PolyglotPDF而不设置持久化目录:
88 |
89 | ```bash
90 | # 先拉取镜像
91 | docker pull 2207397265/polyglotpdf:latest
92 |
93 | # 不挂载卷的容器运行(容器删除后数据将丢失)
94 | docker run -d -p 12226:12226 --name polyglotpdf 2207397265/polyglotpdf:latest
95 | ```
96 |
97 | 这是尝试PolyglotPDF最快的方式,但容器停止后,所有上传的PDF和配置更改都会丢失。
98 |
99 | ## 持久化存储安装
100 |
101 | ```bash
102 | # 创建必要目录
103 | mkdir -p config fonts static/original static/target static/merged_pdf
104 |
105 | # 创建配置文件
106 | nano config/config.json # 或使用任何文本编辑器
107 | # 复制项目中的配置模板到该文件
108 | # 注意填写您的API密钥等配置信息
109 |
110 | # 设置权限
111 | chmod -R 755 config fonts static
112 | ```
113 |
114 | ## 快速启动
115 |
116 | 使用以下命令拉取并运行 PolyglotPDF Docker 镜像:
117 |
118 | ```bash
119 | # 拉取镜像
120 | docker pull 2207397265/polyglotpdf:latest
121 |
122 | # 运行容器
123 | docker run -d -p 12226:12226 --name polyglotpdf \
124 | -v ./config/config.json:/app/config.json \
125 | -v ./fonts:/app/fonts \
126 | -v ./static/original:/app/static/original \
127 | -v ./static/target:/app/static/target \
128 | -v ./static/merged_pdf:/app/static/merged_pdf \
129 | 2207397265/polyglotpdf:latest
130 | ```
131 |
132 | ## 访问应用
133 |
134 | 容器启动后,在浏览器中打开:
135 | ```
136 | http://localhost:12226
137 | ```
138 |
139 | ## 使用 Docker Compose
140 |
141 | 创建 `docker-compose.yml` 文件:
142 |
143 | ```yaml
144 | version: '3'
145 | services:
146 | polyglotpdf:
147 | image: 2207397265/polyglotpdf:latest
148 | ports:
149 | - "12226:12226"
150 | volumes:
151 | - ./config/config.json:/app/config.json # 配置文件
152 | - ./fonts:/app/fonts # 字体文件
153 | - ./static/original:/app/static/original # 原始PDF
154 | - ./static/target:/app/static/target # 翻译后PDF
155 | - ./static/merged_pdf:/app/static/merged_pdf # 合并PDF
156 | restart: unless-stopped
157 | ```
158 |
159 | 然后运行:
160 |
161 | ```bash
162 | docker-compose up -d
163 | ```
164 | ## 常用 Docker 命令
165 |
166 | ```bash
167 | # 停止容器
168 | docker stop polyglotpdf
169 |
170 | # 重启容器
171 | docker restart polyglotpdf
172 |
173 | # 查看日志
174 | docker logs polyglotpdf
175 | ```
176 |
177 |
178 |
179 |
180 | ## 环境要求
181 | - Python 3.8+
182 | - deepl==1.17.0
183 | - Flask==2.0.1
184 | - Flask-Cors==5.0.0
185 | - langdetect==1.0.9
186 | - Pillow==10.2.0
187 | - PyMuPDF==1.24.0
188 | - pytesseract==0.3.10
189 | - requests==2.31.0
190 | - tiktoken==0.6.0
191 | - Werkzeug==2.0.1
192 |
193 | ## 致谢
194 | 本项目得益于 PyMuPDF 强大的 PDF 处理和版面保持功能。
195 |
196 | ## 即将推出的改进
197 | - PDF 聊天功能
198 | - 学术 PDF 搜索集成
199 | - 进一步提升处理速度
200 |
201 | ### 待修复问题
202 | - **问题描述**:应用重编辑时发生错误: `code=4: only Gray, RGB, and CMYK colorspaces supported`
203 | - **现象**:文本块应用编辑时遇到不支持的色彩空间
204 | - **当前解决方案**:遇到不支持的色彩空间时跳过该文本块
205 | - **待解决思路**:对于包含不支持色彩空间的页面,整页切换至OCR模式处理
206 | - **复现示例**:[查看不支持色彩空间的PDF样例](https://github.com/CBIhalsen/PolyglotPDF/blob/main/static/colorspace_issue_sample.pdf)
207 |
208 |
209 | ### TODO
210 | - □ **自定义术语库**:支持自定义术语库,设置prompt进行领域专业翻译
211 | - □ **AI重排功能**:把双栏的PDF转换成HTML博客的单栏线性阅读格式,便于移动端阅读
212 | - □ **多格式导出**:翻译结果可以导出为PDF、HTML、Markdown等格式
213 | - □ **多端同步**:电脑上翻译完,手机上也能看
214 | - □ **增强合并逻辑**:现版本默认合并逻辑把检测字体名字全部关闭,加上水平、垂直、x、y范围重叠全部合并
215 |
216 |
217 | ### 字体优化
218 | 当前在 `main.py` 的 `start` 函数中,文本插入使用了默认字体配置:
219 | ```python
220 | # 当前配置
221 | css=f"* {{font-family:{get_font_by_language(self.target_language)};font-size:auto;color: #111111 ;font-weight:normal;}}"
222 | ```
223 |
224 | 你可以通过以下方式优化字体显示:
225 |
226 | 1. **修改默认字体配置**
227 | ```python
228 | # 自定义字体样式
229 | css=f"""* {{
230 | font-family: {get_font_by_language(self.target_language)};
231 | font-size: auto;
232 | color: #111111;
233 | font-weight: normal;
234 | letter-spacing: 0.5px; # 调整字间距
235 | line-height: 1.5; # 调整行高
236 | }}"""
237 | ```
238 |
239 | 2. **嵌入自定义字体**
240 | 你可以通过以下步骤嵌入自定义字体:
241 | - 将字体文件(如.ttf,.otf)放置在项目的 `fonts` 目录下
242 | - 在CSS中使用 `@font-face` 声明自定义字体
243 | ```python
244 | css=f"""
245 | @font-face {{
246 | font-family: 'CustomFont';
247 | src: url('fonts/your-font.ttf') format('truetype');
248 | }}
249 | * {{
250 | font-family: 'CustomFont', {get_font_by_language(self.target_language)};
251 | font-size: auto;
252 | font-weight: normal;
253 | }}
254 | """
255 | ```
256 |
257 | ### 基本原理
258 | 本项目采用与 Adobe Acrobat DC 编辑 PDF 类似的基本原理,基于 PyMuPDF 识别和处理 PDF 文本块:
259 |
260 | - **核心处理流程**:
261 | ```python
262 | # 获取页面中的文本块
263 | blocks = page.get_text("dict")["blocks"]
264 |
265 | # 遍历处理每个文本块
266 | for block in blocks:
267 | if block.get("type") == 0: # 文本块
268 | bbox = block["bbox"] # 获取文本块边界框
269 | text = ""
270 | font_info = None
271 | # 收集文本和字体信息
272 | for line in block["lines"]:
273 | for span in line["spans"]:
274 | text += span["text"] + " "
275 | ```
276 | 这种方式直接处理 PDF 文本块,保持原有布局不变,实现高效的文本提取和修改。
277 |
278 | - **技术选择**:
279 | - 使用 PyMuPDF 进行 PDF 解析和编辑
280 | - 专注于文本处理,避免复杂化问题
281 | - 不进行 AI 识别公式、表格或页面重组等复杂操作
282 |
283 | - **为什么避免复杂处理**:
284 | - AI 识别公式、表格和重组 PDF 页面的方式存在严重的性能瓶颈
285 | - 复杂的 AI 处理导致计算成本高昂
286 | - 处理时间显著增加(可能需要数十秒甚至更长)
287 | - 难以在生产环境中大规模低成本部署
288 | - 不适合需要快速响应的在线服务
289 |
290 | - **项目定位**:
291 | - 主要用于保留布局的 PDF 文件翻译
292 | - 为 AI 辅助阅读 PDF 提供高效实现方式
293 | - 追求最佳性能价格比
294 |
295 | - **性能表现**:
296 | - PolyglotPDF API 服务响应时间:约 1 秒/页
297 | - 低计算资源消耗,适合规模化部署
298 | - 成本效益高,适合商业应用
299 |
300 |
--------------------------------------------------------------------------------
/README_JA.md:
--------------------------------------------------------------------------------
1 | # PolyglotPDF
2 |
3 | [](https://www.python.org/)
4 | [](https://example.com)
5 | [](https://www.latex-project.org/)
6 | [](https://example.com)
7 | [](https://example.com)
8 | [](https://pymupdf.readthedocs.io/)
9 |
10 | ## デモ
11 |
12 |
13 | ### [🎬 フルビデオを見る](https://github.com/CBIhalsen/PolyglotPDF/blob/main/demo.mp4)
14 | 翻訳APIの選択肢としてLLMsが追加されました。推奨モデル:Doubao、Qwen、deepseek v3、gpt4-o-miniです。カラースペースエラーはPDFファイルの白色領域を埋めることで解決できます。古いtext to text翻訳APIは削除されました。
15 |
16 | また、arXiv検索機能とarXiv論文のLaTeX翻訳後のレンダリングの追加を検討中です。
17 |
18 | ### ページ表示
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 | # 中国の大規模言語モデルAPIの申請
29 |
30 | ## Doubao & Deepseek
31 | 火山エンジンプラットフォームから申請:
32 | - 申請先: [火山エンジン-Doubao](https://www.volcengine.com/product/doubao/)
33 | - 対応モデル: Doubao、Deepseekシリーズモデル
34 |
35 | ## 通義千問(Qwen)
36 | アリババクラウドプラットフォームから申請:
37 | - 申請先: [アリババクラウド-通義千問](https://cn.aliyun.com/product/tongyi?from_alibabacloud=&utm_content=se_1019997984)
38 | - 対応モデル: Qwen-Max、Qwen-Plusなどのシリーズモデル
39 |
40 | ## 概要
41 | PolyglotPDFは、特殊技術を用いてPDF文書内のテキスト、表、数式を超高速で認識する先進的なPDF処理ツールです。通常1秒以内で処理を完了し、OCR機能と完全なレイアウト保持翻訳機能をサポートしています。文書全体の翻訳は通常10秒以内で完了します(翻訳APIプロバイダーによって速度は異なります)。
42 |
43 | ## 主な特徴
44 | - **超高速認識**:約1秒でPDF内のテキスト、表、数式の処理を完了
45 | - **レイアウト保持翻訳**:翻訳時に原文書の書式を完全に保持
46 | - **OCRサポート**:スキャン版文書の効率的な処理
47 | - **テキストベースPDF**:GPUは不要
48 | - **高速翻訳**:約10秒でPDF全体の翻訳を完了
49 | - **柔軟なAPI統合**:各種翻訳サービスプロバイダーと連携可能
50 | - **Webベース比較インターフェース**:原文と訳文の並列比較をサポート
51 | - **強化されたOCR機能**:より正確なテキスト認識と処理能力
52 | - **オフライン翻訳対応**:小規模翻訳モデルの使用
53 |
54 | ## インストールとセットアップ
55 |
56 |
57 | 標準インストール
58 |
59 | 1. リポジトリのクローン:
60 | ```bash
61 | git clone https://github.com/CBIhalsen/Polyglotpdf.git
62 | cd polyglotpdf
63 | ```
64 |
65 | 2. 依存パッケージのインストール:
66 | ```bash
67 | pip install -r requirements.txt
68 | ```
69 |
70 | 3. config.json内でAPIキーを設定。alicloud翻訳APIの使用は推奨されません。
71 |
72 | 4. アプリケーションの実行:
73 | ```bash
74 | python app.py
75 | ```
76 |
77 | 5. Webインターフェースへのアクセス:
78 | ブラウザで `http://127.0.0.1:8000` を開く
79 |
80 |
81 |
82 | Docker 使用方法
83 |
84 | ## 永続化なしの簡易起動
85 |
86 | 永続化ディレクトリを設定せずにPolyglotPDFをすぐにテストしたい場合:
87 |
88 | ```bash
89 | # まずイメージをプル
90 | docker pull 2207397265/polyglotpdf:latest
91 |
92 | # ボリュームをマウントせずにコンテナを実行(コンテナ削除後にデータは失われます)
93 | docker run -d -p 12226:12226 --name polyglotpdf 2207397265/polyglotpdf:latest
94 | ```
95 |
96 | これはPolyglotPDFを試す最速の方法ですが、コンテナ停止後はアップロードしたPDFと設定変更がすべて失われます。
97 |
98 | ## 永続化ストレージでのインストール
99 |
100 | ```bash
101 | # 必要なディレクトリを作成
102 | mkdir -p config fonts static/original static/target static/merged_pdf
103 |
104 | # 設定ファイルを作成
105 | nano config/config.json # または任意のテキストエディタを使用
106 | # プロジェクトの設定テンプレートをこのファイルにコピー
107 | # APIキーなどの設定情報を入力してください
108 |
109 | # 権限を設定
110 | chmod -R 755 config fonts static
111 | ```
112 |
113 | ## クイックスタート
114 |
115 | 以下のコマンドでPolyglotPDF Dockerイメージをプルして実行:
116 |
117 | ```bash
118 | # イメージをプル
119 | docker pull 2207397265/polyglotpdf:latest
120 |
121 | # コンテナを実行
122 | docker run -d -p 12226:12226 --name polyglotpdf \
123 | -v ./config/config.json:/app/config.json \
124 | -v ./fonts:/app/fonts \
125 | -v ./static/original:/app/static/original \
126 | -v ./static/target:/app/static/target \
127 | -v ./static/merged_pdf:/app/static/merged_pdf \
128 | 2207397265/polyglotpdf:latest
129 | ```
130 |
131 | ## アプリケーションへのアクセス
132 |
133 | コンテナ起動後、ブラウザで開く:
134 | ```
135 | http://localhost:12226
136 | ```
137 |
138 | ## Docker Composeの使用
139 |
140 | `docker-compose.yml`ファイルを作成:
141 |
142 | ```yaml
143 | version: '3'
144 | services:
145 | polyglotpdf:
146 | image: 2207397265/polyglotpdf:latest
147 | ports:
148 | - "12226:12226"
149 | volumes:
150 | - ./config.json:/app/config.json # 設定ファイル
151 | - ./fonts:/app/fonts # フォントファイル
152 | - ./static/original:/app/static/original # 原本PDF
153 | - ./static/target:/app/static/target # 翻訳後PDF
154 | - ./static/merged_pdf:/app/static/merged_pdf # 結合PDF
155 | restart: unless-stopped
156 | ```
157 |
158 | そして実行:
159 |
160 | ```bash
161 | docker-compose up -d
162 | ```
163 |
164 | ## よく使うDockerコマンド
165 |
166 | ```bash
167 | # コンテナを停止
168 | docker stop polyglotpdf
169 |
170 | # コンテナを再起動
171 | docker restart polyglotpdf
172 |
173 | # ログの確認
174 | docker logs polyglotpdf
175 | ```
176 |
177 |
178 | ## 環境要件
179 | - Python 3.8+
180 | - deepl==1.17.0
181 | - Flask==2.0.1
182 | - Flask-Cors==5.0.0
183 | - langdetect==1.0.9
184 | - Pillow==10.2.0
185 | - PyMuPDF==1.24.0
186 | - pytesseract==0.3.10
187 | - requests==2.31.0
188 | - tiktoken==0.6.0
189 | - Werkzeug==2.0.1
190 |
191 | ## 謝辞
192 | 本プロジェクトはPyMuPDFの強力なPDF処理とレイアウト保持機能の恩恵を受けています。
193 |
194 | ## 今後の改善予定
195 | - PDFチャット機能
196 | - 学術PDF検索の統合
197 | - 処理速度のさらなる向上
198 |
199 | ### 修正待ちの問題
200 | - **問題の説明**:アプリケーション再編集時のエラー: `code=4: only Gray, RGB, and CMYK colorspaces supported`
201 | - **現象**:テキストブロックの編集時に非対応のカラースペースが発生
202 | - **現在の解決策**:非対応のカラースペースを含むテキストブロックをスキップ
203 | - **解決へのアプローチ**:非対応のカラースペースを含むページ全体をOCRモードで処理
204 | - **再現サンプル**:[非対応カラースペースのPDFサンプルを見る](https://github.com/CBIhalsen/PolyglotPDF/blob/main/static/colorspace_issue_sample.pdf)
205 |
206 | ### TODO
207 | - □ **カスタム用語集**: カスタム用語集をサポートし、特定分野の専門的な翻訳のためのプロンプト設定
208 | - □ **AI再配置機能**: 二段組みPDFをHTMLブログの一列リニア読書形式に変換し、モバイル端末での読書を容易にする
209 | - □ **複数形式エクスポート**: 翻訳結果をPDF、HTML、Markdown等の形式にエクスポート可能
210 | - □ **マルチデバイス同期**: コンピュータで翻訳完了後、スマートフォンでも閲覧可能
211 | - □ **強化されたマージロジック**: 現バージョンのデフォルトマージロジックではフォント名検出を完全に無効にし、水平・垂直・x・y範囲の重複をすべてマージする
212 |
213 | ### フォントの最適化
214 | 現在、`main.py`の`start`関数では、デフォルトのフォント設定でテキストを挿入しています:
215 | ```python
216 | # 現在の設定
217 | css=f"* {{font-family:{get_font_by_language(self.target_language)};font-size:auto;color: #111111 ;font-weight:normal;}}"
218 | ```
219 |
220 | フォント表示は以下の方法で最適化できます:
221 |
222 | 1. **デフォルトフォント設定の変更**
223 | ```python
224 | # カスタムフォントスタイル
225 | css=f"""* {{
226 | font-family: {get_font_by_language(self.target_language)};
227 | font-size: auto;
228 | color: #111111;
229 | font-weight: normal;
230 | letter-spacing: 0.5px; # 文字間隔の調整
231 | line-height: 1.5; # 行の高さの調整
232 | }}"""
233 | ```
234 |
235 | 2. **カスタムフォントの埋め込み**
236 | 以下の手順でカスタムフォントを埋め込むことができます:
237 | - フォントファイル(.ttf、.otfなど)をプロジェクトの`fonts`ディレクトリに配置
238 | - CSSで`@font-face`を使用してカスタムフォントを宣言
239 | ```python
240 | css=f"""
241 | @font-face {{
242 | font-family: 'CustomFont';
243 | src: url('fonts/your-font.ttf') format('truetype');
244 | }}
245 | * {{
246 | font-family: 'CustomFont', {get_font_by_language(self.target_language)};
247 | font-size: auto;
248 | font-weight: normal;
249 | }}
250 | """
251 | ```
252 |
253 | ### 基本原理
254 | 本プロジェクトはAdobe Acrobat DCのPDF編集と同様の基本原理を採用し、PyMuPDFを使用してPDFテキストブロックを認識・処理します:
255 |
256 | - **コア処理フロー**:
257 | ```python
258 | # ページからテキストブロックを取得
259 | blocks = page.get_text("dict")["blocks"]
260 |
261 | # 各テキストブロックを処理
262 | for block in blocks:
263 | if block.get("type") == 0: # テキストブロック
264 | bbox = block["bbox"] # テキストブロックの境界ボックスを取得
265 | text = ""
266 | font_info = None
267 | # テキストとフォント情報の収集
268 | for line in block["lines"]:
269 | for span in line["spans"]:
270 | text += span["text"] + " "
271 | ```
272 | この方法でPDFテキストブロックを直接処理し、元のレイアウトを保持したまま、効率的なテキストの抽出と修正を実現します。
273 |
274 | - **技術選択**:
275 | - PyMuPDFを使用してPDFの解析と編集を行う
276 | - テキスト処理に特化し、問題の複雑化を避ける
277 | - 数式、表、ページ再構成などの複雑なAI認識は行わない
278 |
279 | - **複雑な処理を避ける理由**:
280 | - 数式、表、PDFページ再構成のAI認識には深刻なパフォーマンスのボトルネックが存在
281 | - 複雑なAI処理は計算コストが高額
282 | - 処理時間が大幅に増加(数十秒以上かかる可能性)
283 | - 本番環境での大規模な低コスト展開が困難
284 | - オンラインサービスの迅速なレスポンスに不適
285 |
286 | - **プロジェクトの位置づけ**:
287 | - レイアウトを保持したPDFファイルの翻訳が主目的
288 | - PDFのAI支援読書に効率的な実装方法を提供
289 | - 最適なパフォーマンスとコスト比を追求
290 |
291 | - **パフォーマンス**:
292 | - PolyglotPDF APIサービスのレスポンス時間:約1秒/ページ
293 | - 低計算リソース消費で、スケーラブルな展開が可能
294 | - コスト効率が高く、商用利用に適している
295 |
--------------------------------------------------------------------------------
/README_KO.md:
--------------------------------------------------------------------------------
1 | # PolyglotPDF
2 |
3 | [](https://www.python.org/)
4 | [](https://example.com)
5 | [](https://www.latex-project.org/)
6 | [](https://example.com)
7 | [](https://example.com)
8 | [](https://pymupdf.readthedocs.io/)
9 |
10 | ## 데모
11 |
12 |
13 | ### [🎬 전체 영상 보기](https://github.com/CBIhalsen/PolyglotPDF/blob/main/demo.mp4)
14 | 번역 API 선택지로 LLMs가 추가되었습니다. 권장 모델: Doubao, Qwen, deepseek v3, gpt4-o-mini입니다. 색상 공간 오류는 PDF 파일의 흰색 영역을 채우는 것으로 해결할 수 있습니다. 기존 text to text 번역 API는 삭제되었습니다.
15 |
16 | 또한, arXiv 검색 기능과 arXiv 논문의 LaTeX 번역 후 렌더링 추가를 고려 중입니다.
17 |
18 | ### 페이지 표시
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 | # 중국 대규모 언어 모델 API 신청
29 |
30 | ## Doubao & Deepseek
31 | 화산 엔진 플랫폼을 통한 신청:
32 | - 신청 주소: [화산 엔진-Doubao](https://www.volcengine.com/product/doubao/)
33 | - 지원 모델: Doubao, Deepseek 시리즈 모델
34 |
35 | ## 통의천문(Qwen)
36 | 알리바바 클라우드 플랫폼을 통한 신청:
37 | - 신청 주소: [알리바바 클라우드-통의천문](https://cn.aliyun.com/product/tongyi?from_alibabacloud=&utm_content=se_1019997984)
38 | - 지원 모델: Qwen-Max, Qwen-Plus 등 시리즈 모델
39 |
40 | ## 개요
41 | PolyglotPDF는 특수 기술을 사용하여 PDF 문서 내의 텍스트, 표, 수식을 초고속으로 인식하는 선진적인 PDF 처리 도구입니다. 보통 1초 이내에 처리를 완료하며, OCR 기능과 완벽한 레이아웃 유지 번역 기능을 지원합니다. 문서 전체의 번역은 보통 10초 이내에 완료됩니다(번역 API 제공업체에 따라 속도가 다릅니다).
42 |
43 | ## 주요 특징
44 | - **초고속 인식**: 약 1초 내에 PDF 내의 텍스트, 표, 수식 처리 완료
45 | - **레이아웃 유지 번역**: 번역 시 원문서의 서식을 완벽하게 유지
46 | - **OCR 지원**: 스캔 버전 문서의 효율적인 처리
47 | - **텍스트 기반 PDF**: GPU 불필요
48 | - **고속 번역**: 약 10초 내에 PDF 전체 번역 완료
49 | - **유연한 API 통합**: 각종 번역 서비스 제공업체와 연동 가능
50 | - **웹 기반 비교 인터페이스**: 원문과 번역문의 병렬 비교 지원
51 | - **강화된 OCR 기능**: 더 정확한 텍스트 인식과 처리 능력
52 | - **오프라인 번역 지원**: 소규모 번역 모델 사용
53 |
54 | ## 설치 및 설정
55 |
56 |
57 | 표준 설치
58 |
59 | 1. 저장소 클론:
60 | ```bash
61 | git clone https://github.com/CBIhalsen/Polyglotpdf.git
62 | cd polyglotpdf
63 | ```
64 |
65 | 2. 의존성 패키지 설치:
66 | ```bash
67 | pip install -r requirements.txt
68 | ```
69 |
70 | 3. config.json에서 API 키 설정. alicloud 번역 API 사용은 권장되지 않습니다.
71 |
72 | 4. 애플리케이션 실행:
73 | ```bash
74 | python app.py
75 | ```
76 |
77 | 5. 웹 인터페이스 접속:
78 | 브라우저에서 `http://127.0.0.1:8000` 열기
79 |
80 |
81 |
82 | Docker 사용 방법
83 |
84 | ## 비지속성 빠른 시작
85 |
86 | 영구 디렉토리 설정 없이 PolyglotPDF를 빠르게 테스트하려면:
87 |
88 | ```bash
89 | # 먼저 이미지 가져오기
90 | docker pull 2207397265/polyglotpdf:latest
91 |
92 | # 볼륨 마운트 없이 컨테이너 실행(컨테이너 삭제 시 데이터 손실)
93 | docker run -d -p 12226:12226 --name polyglotpdf 2207397265/polyglotpdf:latest
94 | ```
95 |
96 | 이것은 PolyglotPDF를 시도하는 가장 빠른 방법이지만, 컨테이너가 중지되면 업로드된 모든 PDF와 구성 변경 사항이 손실됩니다.
97 |
98 | ## 영구 저장소 설치
99 |
100 | ```bash
101 | # 필요한 디렉토리 생성
102 | mkdir -p config fonts static/original static/target static/merged_pdf
103 |
104 | # 설정 파일 생성
105 | nano config/config.json # 또는 원하는 텍스트 편집기 사용
106 | # 프로젝트의 설정 템플릿을 이 파일에 복사
107 | # API 키 등의 설정 정보를 입력하세요
108 |
109 | # 권한 설정
110 | chmod -R 755 config fonts static
111 | ```
112 |
113 | ## 빠른 시작
114 |
115 | 다음 명령을 사용하여 PolyglotPDF Docker 이미지를 가져와 실행:
116 |
117 | ```bash
118 | # 이미지 가져오기
119 | docker pull 2207397265/polyglotpdf:latest
120 |
121 | # 컨테이너 실행
122 | docker run -d -p 12226:12226 --name polyglotpdf \
123 | -v ./config/config.json:/app/config.json \
124 | -v ./fonts:/app/fonts \
125 | -v ./static/original:/app/static/original \
126 | -v ./static/target:/app/static/target \
127 | -v ./static/merged_pdf:/app/static/merged_pdf \
128 | 2207397265/polyglotpdf:latest
129 | ```
130 |
131 | ## 애플리케이션 접속
132 |
133 | 컨테이너가 시작된 후, 브라우저에서 열기:
134 | ```
135 | http://localhost:12226
136 | ```
137 |
138 | ## Docker Compose 사용
139 |
140 | `docker-compose.yml` 파일 생성:
141 |
142 | ```yaml
143 | version: '3'
144 | services:
145 | polyglotpdf:
146 | image: 2207397265/polyglotpdf:latest
147 | ports:
148 | - "12226:12226"
149 | volumes:
150 | - ./config.json:/app/config.json # 설정 파일
151 | - ./fonts:/app/fonts # 폰트 파일
152 | - ./static/original:/app/static/original # 원본 PDF
153 | - ./static/target:/app/static/target # 번역된 PDF
154 | - ./static/merged_pdf:/app/static/merged_pdf # 병합된 PDF
155 | restart: unless-stopped
156 | ```
157 |
158 | 그리고 실행:
159 |
160 | ```bash
161 | docker-compose up -d
162 | ```
163 |
164 | ## 자주 사용하는 Docker 명령어
165 |
166 | ```bash
167 | # 컨테이너 중지
168 | docker stop polyglotpdf
169 |
170 | # 컨테이너 재시작
171 | docker restart polyglotpdf
172 |
173 | # 로그 확인
174 | docker logs polyglotpdf
175 | ```
176 |
177 |
178 | ## 환경 요구사항
179 | - Python 3.8+
180 | - deepl==1.17.0
181 | - Flask==2.0.1
182 | - Flask-Cors==5.0.0
183 | - langdetect==1.0.9
184 | - Pillow==10.2.0
185 | - PyMuPDF==1.24.0
186 | - pytesseract==0.3.10
187 | - requests==2.31.0
188 | - tiktoken==0.6.0
189 | - Werkzeug==2.0.1
190 |
191 | ## 감사의 말
192 | 본 프로젝트는 PyMuPDF의 강력한 PDF 처리와 레이아웃 유지 기능의 혜택을 받았습니다.
193 |
194 | ## 향후 개선 예정
195 | - PDF 채팅 기능
196 | - 학술 PDF 검색 통합
197 | - 처리 속도 추가 향상
198 |
199 | ### 수정 대기 중인 문제
200 | - **문제 설명**: 애플리케이션 재편집 시 오류: `code=4: only Gray, RGB, and CMYK colorspaces supported`
201 | - **현상**: 텍스트 블록 편집 시 지원되지 않는 색상 공간 발생
202 | - **현재 해결책**: 지원되지 않는 색상 공간을 포함한 텍스트 블록 건너뛰기
203 | - **해결 접근 방식**: 지원되지 않는 색상 공간을 포함한 페이지 전체를 OCR 모드로 처리
204 | - **재현 샘플**: [지원되지 않는 색상 공간의 PDF 샘플 보기](https://github.com/CBIhalsen/PolyglotPDF/blob/main/static/colorspace_issue_sample.pdf)
205 |
206 | ### TODO
207 | - □ **사용자 정의 용어집**: 사용자 정의 용어집을 지원하고, 특정 분야의 전문적인 번역을 위한 프롬프트 설정
208 | - □ **AI 재배치 기능**: 두 칸 PDF를 HTML 블로그의 한 줄 선형 읽기 형식으로 변환하여 모바일 장치에서 읽기 편하게 함
209 | - □ **다중 형식 내보내기**: 번역 결과를 PDF, HTML, Markdown 등 다양한 형식으로 내보내기
210 | - □ **다중 기기 동기화**: 컴퓨터에서 번역 완료한 후 모바일에서도 볼 수 있음
211 | - □ **향상된 병합 로직**: 현재 버전의 기본 병합 로직에서 글꼴 이름 감지를 모두 비활성화하고, 가로, 세로, x, y 범위 중복이 모두 병합되도록 함
212 |
213 | ### 폰트 최적화
214 | 현재 `main.py`의 `start` 함수에서는 기본 폰트 설정으로 텍스트를 삽입합니다:
215 | ```python
216 | # 현재 설정
217 | css=f"* {{font-family:{get_font_by_language(self.target_language)};font-size:auto;color: #111111 ;font-weight:normal;}}"
218 | ```
219 |
220 | 폰트 표시는 다음 방법으로 최적화할 수 있습니다:
221 |
222 | 1. **기본 폰트 설정 변경**
223 | ```python
224 | # 사용자 정의 폰트 스타일
225 | css=f"""* {{
226 | font-family: {get_font_by_language(self.target_language)};
227 | font-size: auto;
228 | color: #111111;
229 | font-weight: normal;
230 | letter-spacing: 0.5px; # 자간 조정
231 | line-height: 1.5; # 행간 조정
232 | }}"""
233 | ```
234 |
235 | 2. **사용자 정의 폰트 임베딩**
236 | 다음 단계로 사용자 정의 폰트를 임베딩할 수 있습니다:
237 | - 폰트 파일(.ttf, .otf 등)을 프로젝트의 `fonts` 디렉토리에 배치
238 | - CSS에서 `@font-face`를 사용하여 사용자 정의 폰트 선언
239 | ```python
240 | css=f"""
241 | @font-face {{
242 | font-family: 'CustomFont';
243 | src: url('fonts/your-font.ttf') format('truetype');
244 | }}
245 | * {{
246 | font-family: 'CustomFont', {get_font_by_language(self.target_language)};
247 | font-size: auto;
248 | font-weight: normal;
249 | }}
250 | """
251 | ```
252 |
253 | ### 기본 원리
254 | 본 프로젝트는 Adobe Acrobat DC의 PDF 편집과 유사한 기본 원리를 채택하고, PyMuPDF를 사용하여 PDF 텍스트 블록을 인식하고 처리합니다:
255 |
256 | - **핵심 처리 흐름**:
257 | ```python
258 | # 페이지에서 텍스트 블록 가져오기
259 | blocks = page.get_text("dict")["blocks"]
260 |
261 | # 각 텍스트 블록 처리
262 | for block in blocks:
263 | if block.get("type") == 0: # 텍스트 블록
264 | bbox = block["bbox"] # 텍스트 블록의 경계 상자 가져오기
265 | text = ""
266 | font_info = None
267 | # 텍스트와 폰트 정보 수집
268 | for line in block["lines"]:
269 | for span in line["spans"]:
270 | text += span["text"] + " "
271 | ```
272 | 이 방법으로 PDF 텍스트 블록을 직접 처리하여 원래 레이아웃을 유지한 채 효율적인 텍스트 추출과 수정을 실현합니다.
273 |
274 | - **기술 선택**:
275 | - PyMuPDF를 사용하여 PDF 분석과 편집 수행
276 | - 텍스트 처리에 특화하여 문제의 복잡화 방지
277 | - 수식, 표, 페이지 재구성 등의 복잡한 AI 인식은 수행하지 않음
278 |
279 | - **복잡한 처리를 피하는 이유**:
280 | - 수식, 표, PDF 페이지 재구성의 AI 인식에는 심각한 성능 병목 현상 존재
281 | - 복잡한 AI 처리는 계산 비용이 높음
282 | - 처리 시간이 크게 증가(수십 초 이상 소요 가능)
283 | - 프로덕션 환경에서의 대규모 저비용 배포가 어려움
284 | - 온라인 서비스의 신속한 응답에 부적합
285 |
286 | - **프로젝트 위치**:
287 | - 레이아웃을 유지한 PDF 파일의 번역이 주목적
288 | - PDF의 AI 지원 읽기에 효율적인 구현 방법 제공
289 | - 최적의 성능과 비용 비율 추구
290 |
291 | - **성능**:
292 | - PolyglotPDF API 서비스의 응답 시간: 약 1초/페이지
293 | - 낮은 계산 리소스 소비로 확장 가능한 배포 가능
294 | - 비용 효율이 높아 상업적 사용에 적합
295 |
--------------------------------------------------------------------------------
/README_TW.md:
--------------------------------------------------------------------------------
1 | # PolyglotPDF
2 |
3 | [](https://www.python.org/)
4 | [](https://example.com)
5 | [](https://www.latex-project.org/)
6 | [](https://example.com)
7 | [](https://example.com)
8 | [](https://pymupdf.readthedocs.io/)
9 |
10 | ## 演示
11 |
12 |
13 | ### [🎬 觀看完整影片](https://github.com/CBIhalsen/PolyglotPDF/blob/main/demo.mp4)
14 | 翻譯API選項已新增LLMs。推薦模型:Doubao、Qwen、deepseek v3、gpt4-o-mini。色彩空間錯誤可透過填充PDF檔案的白色區域來解決。舊有的text to text翻譯API已被移除。
15 |
16 | 此外,我們正在考慮新增arXiv搜尋功能和arXiv論文的LaTeX翻譯後渲染功能。
17 |
18 | ### 頁面展示
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 | # 中國大型語言模型API申請
29 |
30 | ## Doubao & Deepseek
31 | 從火山引擎平台申請:
32 | - 申請地址:[火山引擎-Doubao](https://www.volcengine.com/product/doubao/)
33 | - 支援模型:Doubao、Deepseek系列模型
34 |
35 | ## 通義千問(Qwen)
36 | 從阿里雲平台申請:
37 | - 申請地址:[阿里雲-通義千問](https://cn.aliyun.com/product/tongyi?from_alibabacloud=&utm_content=se_1019997984)
38 | - 支援模型:Qwen-Max、Qwen-Plus等系列模型
39 |
40 | ## 概述
41 | PolyglotPDF是一款使用特殊技術,能夠超高速識別PDF文件中文字、表格、數學公式的先進PDF處理工具。通常能在1秒內完成處理,並支援OCR功能和完整的版面保持翻譯功能。整份文件的翻譯通常能在10秒內完成(速度依翻譯API提供商而異)。
42 |
43 | ## 主要特點
44 | - **超高速識別**:約1秒內完成PDF中文字、表格、數學公式的處理
45 | - **版面保持翻譯**:翻譯時完整保持原文件的格式
46 | - **OCR支援**:高效處理掃描版文件
47 | - **文字基礎PDF**:無需GPU
48 | - **快速翻譯**:約10秒完成PDF整體翻譯
49 | - **靈活API整合**:可與各種翻譯服務提供商連接
50 | - **網頁基礎比較介面**:支援原文與譯文並列比較
51 | - **強化OCR功能**:更準確的文字識別和處理能力
52 | - **離線翻譯支援**:使用小型翻譯模型
53 |
54 | ## 安裝與設定
55 |
56 |
57 | 標準安裝
58 |
59 | 1. 複製儲存庫:
60 | ```bash
61 | git clone https://github.com/CBIhalsen/Polyglotpdf.git
62 | cd polyglotpdf
63 | ```
64 |
65 | 2. 安裝相依套件:
66 | ```bash
67 | pip install -r requirements.txt
68 | ```
69 |
70 | 3. 在config.json中設定API金鑰。不建議使用alicloud翻譯API。
71 |
72 | 4. 執行應用程式:
73 | ```bash
74 | python app.py
75 | ```
76 |
77 | 5. 存取網頁介面:
78 | 在瀏覽器中開啟 `http://127.0.0.1:8000`
79 |
80 |
81 |
82 | Docker 使用說明
83 |
84 | ## 無持久化快速啟動
85 |
86 | 如果您想快速測試PolyglotPDF而不設置持久化目錄:
87 |
88 | ```bash
89 | # 先拉取映像
90 | docker pull 2207397265/polyglotpdf:latest
91 |
92 | # 不掛載卷的容器運行(容器刪除後數據將丟失)
93 | docker run -d -p 12226:12226 --name polyglotpdf 2207397265/polyglotpdf:latest
94 | ```
95 |
96 | 這是嘗試PolyglotPDF最快的方式,但容器停止後,所有上傳的PDF和配置更改都會丟失。
97 |
98 | ## 持久化存儲安裝
99 |
100 | ```bash
101 | # 創建必要目錄
102 | mkdir -p config fonts static/original static/target static/merged_pdf
103 |
104 | # 創建配置文件
105 | nano config/config.json # 或使用任何文本編輯器
106 | # 將項目中的配置模板複製到該文件
107 | # 請注意填寫您的API金鑰等配置信息
108 |
109 | # 設置權限
110 | chmod -R 755 config fonts static
111 | ```
112 |
113 | ## 快速啟動
114 |
115 | 使用以下命令拉取並運行 PolyglotPDF Docker 映像:
116 |
117 | ```bash
118 | # 拉取映像
119 | docker pull 2207397265/polyglotpdf:latest
120 |
121 | # 運行容器
122 | docker run -d -p 12226:12226 --name polyglotpdf \
123 | -v ./config/config.json:/app/config.json \
124 | -v ./fonts:/app/fonts \
125 | -v ./static/original:/app/static/original \
126 | -v ./static/target:/app/static/target \
127 | -v ./static/merged_pdf:/app/static/merged_pdf \
128 | 2207397265/polyglotpdf:latest
129 | ```
130 |
131 | ## 訪問應用
132 |
133 | 容器啟動後,在瀏覽器中打開:
134 | ```
135 | http://localhost:12226
136 | ```
137 |
138 | ## 使用 Docker Compose
139 |
140 | 創建 `docker-compose.yml` 文件:
141 |
142 | ```yaml
143 | version: '3'
144 | services:
145 | polyglotpdf:
146 | image: 2207397265/polyglotpdf:latest
147 | ports:
148 | - "12226:12226"
149 | volumes:
150 | - ./config.json:/app/config.json # 配置文件
151 | - ./fonts:/app/fonts # 字體文件
152 | - ./static/original:/app/static/original # 原始PDF
153 | - ./static/target:/app/static/target # 翻譯後PDF
154 | - ./static/merged_pdf:/app/static/merged_pdf # 合併PDF
155 | restart: unless-stopped
156 | ```
157 |
158 | 然後運行:
159 |
160 | ```bash
161 | docker-compose up -d
162 | ```
163 |
164 | ## 常用 Docker 命令
165 |
166 | ```bash
167 | # 停止容器
168 | docker stop polyglotpdf
169 |
170 | # 重啟容器
171 | docker restart polyglotpdf
172 |
173 | # 查看日誌
174 | docker logs polyglotpdf
175 | ```
176 |
177 |
178 | ## 環境需求
179 | - Python 3.8+
180 | - deepl==1.17.0
181 | - Flask==2.0.1
182 | - Flask-Cors==5.0.0
183 | - langdetect==1.0.9
184 | - Pillow==10.2.0
185 | - PyMuPDF==1.24.0
186 | - pytesseract==0.3.10
187 | - requests==2.31.0
188 | - tiktoken==0.6.0
189 | - Werkzeug==2.0.1
190 |
191 | ## 致謝
192 | 本專案受益於PyMuPDF強大的PDF處理和版面保持功能。
193 |
194 | ## 未來改進計劃
195 | - PDF聊天功能
196 | - 學術PDF搜尋整合
197 | - 進一步提升處理速度
198 |
199 | ### 待修正問題
200 | - **問題描述**:應用程式重新編輯時的錯誤:`code=4: only Gray, RGB, and CMYK colorspaces supported`
201 | - **現象**:編輯文字區塊時出現不支援的色彩空間
202 | - **目前解決方案**:跳過包含不支援色彩空間的文字區塊
203 | - **解決方向**:使用OCR模式處理包含不支援色彩空間的整個頁面
204 | - **重現範例**:[查看不支援色彩空間的PDF範例](https://github.com/CBIhalsen/PolyglotPDF/blob/main/static/colorspace_issue_sample.pdf)
205 |
206 | ### TODO
207 | - □ **自定義術語庫**:支援自定義術語庫,設置prompt進行領域專業翻譯
208 | - □ **AI重排功能**:把雙欄的PDF轉換成HTML部落格的單欄線性閱讀格式,便於移動端閱讀
209 | - □ **多格式匯出**:翻譯結果可以匯出為PDF、HTML、Markdown等格式
210 | - □ **多端同步**:電腦上翻譯完,手機上也能看
211 | - □ **增強合併邏輯**:現版本預設合併邏輯把檢測字體名字全部關閉,加上水平、垂直、x、y範圍重疊全部合併
212 |
213 | ### 字型最佳化
214 | 目前在`main.py`的`start`函數中,使用預設字型設定插入文字:
215 | ```python
216 | # 目前設定
217 | css=f"* {{font-family:{get_font_by_language(self.target_language)};font-size:auto;color: #111111 ;font-weight:normal;}}"
218 | ```
219 |
220 | 字型顯示可透過以下方式最佳化:
221 |
222 | 1. **修改預設字型設定**
223 | ```python
224 | # 自訂字型樣式
225 | css=f"""* {{
226 | font-family: {get_font_by_language(self.target_language)};
227 | font-size: auto;
228 | color: #111111;
229 | font-weight: normal;
230 | letter-spacing: 0.5px; # 調整字距
231 | line-height: 1.5; # 調整行高
232 | }}"""
233 | ```
234 |
235 | 2. **嵌入自訂字型**
236 | 可透過以下步驟嵌入自訂字型:
237 | - 將字型檔案(.ttf、.otf等)放置在專案的`fonts`目錄中
238 | - 在CSS中使用`@font-face`宣告自訂字型
239 | ```python
240 | css=f"""
241 | @font-face {{
242 | font-family: 'CustomFont';
243 | src: url('fonts/your-font.ttf') format('truetype');
244 | }}
245 | * {{
246 | font-family: 'CustomFont', {get_font_by_language(self.target_language)};
247 | font-size: auto;
248 | font-weight: normal;
249 | }}
250 | """
251 | ```
252 |
253 | ### 基本原理
254 | 本專案採用與Adobe Acrobat DC的PDF編輯類似的基本原理,使用PyMuPDF識別和處理PDF文字區塊:
255 |
256 | - **核心處理流程**:
257 | ```python
258 | # 從頁面取得文字區塊
259 | blocks = page.get_text("dict")["blocks"]
260 |
261 | # 處理每個文字區塊
262 | for block in blocks:
263 | if block.get("type") == 0: # 文字區塊
264 | bbox = block["bbox"] # 取得文字區塊的邊界框
265 | text = ""
266 | font_info = None
267 | # 收集文字和字型資訊
268 | for line in block["lines"]:
269 | for span in line["spans"]:
270 | text += span["text"] + " "
271 | ```
272 | 這種方式直接處理PDF文字區塊,在保持原始版面的同時,實現高效的文字擷取和修改。
273 |
274 | - **技術選擇**:
275 | - 使用PyMuPDF進行PDF解析和編輯
276 | - 專注於文字處理,避免問題複雜化
277 | - 不進行複雜的AI識別,如數學公式、表格、頁面重構
278 |
279 | - **避免複雜處理的原因**:
280 | - 數學公式、表格、PDF頁面重構的AI識別存在嚴重的效能瓶頸
281 | - 複雜的AI處理計算成本高昂
282 | - 處理時間大幅增加(可能需要數十秒以上)
283 | - 難以在生產環境中進行大規模低成本部署
284 | - 不適合線上服務的快速回應
285 |
286 | - **專案定位**:
287 | - 主要目的是保持版面的PDF檔案翻譯
288 | - 提供PDF AI輔助閱讀的高效實現方式
289 | - 追求最佳效能和成本比
290 |
291 | - **效能表現**:
292 | - PolyglotPDF API服務回應時間:約1秒/頁
293 | - 低計算資源消耗,可擴展部署
294 | - 成本效益高,適合商業使用
295 |
--------------------------------------------------------------------------------
/Subset_Font.py:
--------------------------------------------------------------------------------
1 | from fontTools.subset import Subsetter, Options
2 | from fontTools.ttLib import TTFont
3 | import datetime
4 | import os
5 | import requests
6 |
7 |
8 | def download_font_from_github(language, font_filename, target_path):
9 |     """
10 |     Download a font file from the PolyglotPDF-fonts GitHub repository.
11 |     """
12 |
13 | # 构建GitHub原始文件URL
14 | github_base_url = "https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF-fonts/main"
15 | font_folder = f"{language}_fonts"
16 | github_url = f"{github_base_url}/{font_folder}/{font_filename}"
17 |
18 | try:
19 | # 下载文件
20 | response = requests.get(github_url)
21 |
22 | # 检查是否存在(GitHub返回404表示文件不存在)
23 | if response.status_code == 404:
24 | print("\n=== 字体文件未找到 ===")
25 | print(f"在GitHub仓库中未找到所需的字体文件:")
26 | print(f"- 语言: {language}")
27 | print(f"- 字体文件: {font_filename}")
28 | print(f"- 预期路径: {font_folder}/{font_filename}")
29 | print("\n请通过以下步骤请求添加字体:")
30 | print("1. 访问: https://github.com/CBIhalsen/PolyglotPDF-fonts")
31 | print("2. 创建新的Issue")
32 | print("3. 标题: [Font Request] Add font for {language}")
33 | print("4. 内容:")
34 | print(f" - Language: {language}")
35 | print(f" - Font filename: {font_filename}")
36 | print(f" - Expected path: {font_folder}/{font_filename}")
37 | print(" - Additional details: (请描述使用场景和需求)\n")
38 | return False
39 |
40 | response.raise_for_status() # 检查其他可能的错误
41 |
42 | # 创建目标文件夹并保存文件
43 | os.makedirs(os.path.dirname(target_path), exist_ok=True)
44 | with open(target_path, 'wb') as f:
45 | f.write(response.content)
46 |
47 | print(f"成功从GitHub下载字体文件到: {target_path}")
48 | return True
49 |
50 | except requests.exceptions.RequestException as e:
51 | if isinstance(e, requests.exceptions.ConnectionError):
52 | print(f"网络连接错误: 无法连接到GitHub。请检查您的网络连接。")
53 | elif isinstance(e, requests.exceptions.Timeout):
54 | print(f"请求超时: GitHub响应时间过长。")
55 | else:
56 | print(f"下载字体文件失败: {str(e)}")
57 | return False
58 |
59 |
60 | def check_glyph_coverage(font, text):
61 |     """
62 |     Check whether the font contains all glyphs required by the text.
63 |     Returns the list of characters that were not found.
64 |     """
65 | cmap = font.getBestCmap()
66 | missing_chars = []
67 |
68 | for char in text:
69 | if ord(char) not in cmap:
70 | missing_chars.append(char)
71 |
72 | return missing_chars
73 |
74 |
75 | def subset_font(in_font_path, out_font_path, text, language):
76 |     """
77 |     Subset in_font_path with fontTools, keeping only the characters that appear
78 |     in text, and write the result to out_font_path.
79 |     """
80 |     b = datetime.datetime.now()
81 |
82 | # 检查输入字体文件是否存在
83 | if not os.path.exists(in_font_path):
84 | print(f"输入字体文件不存在: {in_font_path}")
85 | print("尝试从GitHub下载字体文件...")
86 |
87 | # 获取原始字体文件名
88 | font_filename = os.path.basename(in_font_path)
89 |
90 | # 尝试下载字体
91 | if not download_font_from_github(language, font_filename, in_font_path):
92 | print("无法获取字体文件,子集化操作终止")
93 | return
94 |
95 | # 确保输出文件夹存在
96 | output_dir = os.path.dirname(out_font_path)
97 | if output_dir and not os.path.exists(output_dir):
98 | os.makedirs(output_dir)
99 | print(f"创建输出目录: {output_dir}")
100 |
101 | # 去重并排序要保留的字符
102 | unique_chars = "".join(sorted(set(text)))
103 |
104 | # 读取原字体
105 | font = TTFont(in_font_path)
106 |
107 | # 检查字形覆盖
108 | missing_chars = check_glyph_coverage(font, unique_chars)
109 | if missing_chars:
110 | print("\n=== 字形缺失警告 ===")
111 | print(f"字体文件 {os.path.basename(in_font_path)} 中未找到以下字符:")
112 | print("".join(missing_chars))
113 | print("这些字符将使用 PyMuPDF 默认字体进行显示")
114 | print("==================\n")
115 |
116 | # 从text中移除缺失的字符,只对有字形的字符进行子集化
117 | for char in missing_chars:
118 | unique_chars = unique_chars.replace(char, '')
119 |
120 | # 配置子集化选项
121 | options = Options()
122 |
123 | # 创建子集器并指定要包含的字符
124 | subsetter = Subsetter(options=options)
125 | subsetter.populate(text=unique_chars)
126 |
127 | # 对字体做子集化
128 | subsetter.subset(font)
129 |
130 | # 保存子集化后的 TTF
131 | font.save(out_font_path)
132 | print(f"生成子集字体: {out_font_path} (仅包含所需字形)")
133 |
134 | e = datetime.datetime.now()
135 | elapsed_time = (e - b).total_seconds()
136 | print(f"子集化运行时间: {elapsed_time} 秒")
137 |
--------------------------------------------------------------------------------
/YouDao_translation.py:
--------------------------------------------------------------------------------
1 | import uuid
2 | import requests
3 | import hashlib
4 | import time
5 | import json
6 |
7 |
8 | def translate(texts, original_lang, target_lang):
9 |     """
10 |     Youdao Translation API wrapper.
11 |
12 |     Args:
13 |         texts: list of strings to translate (a single string is also accepted)
14 |         original_lang: source language code, e.g. 'auto'
15 |         target_lang: target language code ('zh' is mapped to 'zh-CHS')
16 |
17 |     Returns:
18 |         list: translated texts, or None if the request fails
19 |     """
20 | YOUDAO_URL = 'https://openapi.youdao.com/v2/api'
21 |
22 | with open("config.json", 'r', encoding='utf-8') as f:
23 | config = json.load(f)
24 |
25 | # 获取指定服务的认证信息
26 | if target_lang == 'zh':
27 | target_lang='zh-CHS'
28 | service_name = "youdao"
29 | credentials = config['translation_services'].get(service_name)
30 | if not credentials:
31 | raise ValueError(f"Translation service '{service_name}' not found in config")
32 |
33 |
34 | def encrypt(sign_str):
35 | hash_algorithm = hashlib.sha256()
36 | hash_algorithm.update(sign_str.encode('utf-8'))
37 | return hash_algorithm.hexdigest()
38 |
39 | def truncate(q):
40 | if q is None:
41 | return None
42 | size = len(q)
43 | return q if size <= 20 else q[0:10] + str(size) + q[size - 10:size]
44 |
45 | def do_request(data):
46 | headers = {'Content-Type': 'application/x-www-form-urlencoded'}
47 | return requests.post(YOUDAO_URL, data=data, headers=headers)
48 |
49 | try:
50 | # 确保输入文本为列表格式
51 | if isinstance(texts, str):
52 | texts = [texts]
53 |
54 |
55 | # 准备请求数据
56 | data = {
57 | 'from': original_lang,
58 | 'to': target_lang,
59 | 'signType': 'v3',
60 | 'curtime': str(int(time.time())),
61 | 'appKey': credentials['app_key'],
62 | 'q': texts,
63 | 'salt': str(uuid.uuid1()),
64 | 'vocabId': "您的用户词表ID"
65 | }
66 |
67 | # 生成签名
68 | sign_str = (credentials['app_key'] +
69 | truncate(''.join(texts)) +
70 | data['salt'] +
71 | data['curtime'] +
72 | credentials['app_secret'])
73 | data['sign'] = encrypt(sign_str)
74 |
75 | # 发送请求
76 | response = do_request(data)
77 | response_data = json.loads(response.content.decode("utf-8"))
78 |
79 | # 提取翻译结果
80 | translations = [result["translation"] for result in response_data["translateResults"]]
81 | print(translations)
82 | return translations
83 |
84 | except Exception as e:
85 | print(f"翻译出错: {str(e)}")
86 | return None
87 | # Usage example:
88 | if __name__ == '__main__':
89 | # Credentials are read from config.json
90 |
91 |
92 | # Texts to translate
93 | texts = ["很久很久以前", "待输入的文字2", "待输入的文字3"]
94 | original_lang = 'auto'
95 |
96 | # Target language
97 | target_lang = 'zh'
98 |
99 | # Run the translation
100 | results = translate(texts,original_lang='auto', target_lang=target_lang)
101 | print(results)
102 |
103 | if results:
104 | for original, translated in zip(texts, results):
105 | print(f"原文: {original}")
106 | print(f"译文: {translated}\n")
107 |
108 |
--------------------------------------------------------------------------------
/build.py:
--------------------------------------------------------------------------------
121 | import os
122 | import sys
123 | import platform
124 | import subprocess
125 | import shutil
126 | from pathlib import Path
127 |
128 |
129 | def main():
130 | # ────────────────────────────────────────────────────────────────────────
131 | # 1. Setup: get the current directory and check for PyInstaller
132 | # ────────────────────────────────────────────────────────────────────────
133 | current_dir = Path(__file__).parent.absolute()
134 | print(f"当前目录: {current_dir}")
135 |
136 | try:
137 | import PyInstaller
138 | print("PyInstaller 已安装")
139 | except ImportError:
140 | print("安装 PyInstaller...")
141 | subprocess.run([sys.executable, "-m", "pip", "install", "pyinstaller"], check=True)
142 |
143 | system = platform.system().lower()
144 | print(f"当前系统: {system}")
145 |
146 | # Executable name (becomes EbookTranslator.exe on Windows, no suffix on other systems)
147 | exe_name = "EbookTranslator"
148 |
149 | # ────────────────────────────────────────────────────────────────────────
150 | # 2. Create the output directory
151 | # ────────────────────────────────────────────────────────────────────────
152 | dist_dir = current_dir / "dist"
153 | dist_app_dir = dist_dir / exe_name
154 |
155 | # Delete the output directory first if it already exists
156 | if dist_app_dir.exists():
157 | print(f"清理已存在的输出目录: {dist_app_dir}")
158 | shutil.rmtree(dist_app_dir)
159 |
160 | os.makedirs(dist_app_dir, exist_ok=True)
161 |
162 | # ────────────────────────────────────────────────────────────────────────
163 | # 3. Check the required resource files
164 | # ────────────────────────────────────────────────────────────────────────
165 | required_files = {
166 | 'app.py': True,
167 | 'index.html': True,
168 | 'pdfviewer.html': True,
169 | 'pdfviewer2.html': True,
170 | 'merge_pdf.py': True,
171 | 'config.json': True,
172 | 'static': True
173 | }
174 | for file_name, required in required_files.items():
175 | file_path = current_dir / file_name
176 | if not file_path.exists() and required:
177 | print(f"错误: 必要文件 '{file_name}' 不存在")
178 | sys.exit(1)
179 |
180 | # ────────────────────────────────────────────────────────────────────────
181 | # 4. Build the PyInstaller command - no resource files are bundled here
182 | # ────────────────────────────────────────────────────────────────────────
183 | pyinstaller_cmd = [
184 | sys.executable, '-m', 'PyInstaller',
185 | '--noconfirm',
186 | '--onedir', # onedir mode
187 | '--name', exe_name, # name of the generated file (folder)
188 | ##'--windowed' # would produce a macOS .app bundle
189 | ]
190 |
191 | # On Windows, use icon.ico as the icon if it exists
192 | icon_file = current_dir / "icon.ico"
193 | if system == 'windows' and icon_file.exists():
194 | pyinstaller_cmd.extend(["--icon", str(icon_file)])
195 |
196 | # Finally, specify the main script (app.py)
197 | pyinstaller_cmd.append(str(current_dir / 'app.py'))
198 |
199 | # ────────────────────────────────────────────────────────────────────────
200 | # 5. Run the PyInstaller command
201 | # ────────────────────────────────────────────────────────────────────────
202 | print("执行 PyInstaller 命令:\n", " ".join(map(str, pyinstaller_cmd)))
203 | try:
204 | subprocess.run(pyinstaller_cmd, check=True)
205 | print("PyInstaller 打包完成")
206 | except Exception as e:
207 | print(f"PyInstaller 打包失败: {e}")
208 | sys.exit(1)
209 |
210 | # ────────────────────────────────────────────────────────────────────────
211 | # 6. Manually copy all resource files into the output directory
212 | # ────────────────────────────────────────────────────────────────────────
213 | print("\n开始复制资源文件到输出目录...")
214 |
215 | # Copy index.html
216 | if (current_dir / 'index.html').exists():
217 | print(f"复制 index.html 到 {dist_app_dir}")
218 | shutil.copy2(current_dir / 'index.html', dist_app_dir / 'index.html')
219 |
220 |
221 | if (current_dir / 'pdfviewer.html').exists():
222 | print(f"复制 pdfviewer.html 到 {dist_app_dir}")
223 | shutil.copy2(current_dir / 'pdfviewer.html', dist_app_dir / 'pdfviewer.html')
224 |
225 | # Copy pdfviewer2.html
226 | if (current_dir / 'pdfviewer2.html').exists():
227 | print(f"复制 pdfviewer2.html 到 {dist_app_dir}")
228 | shutil.copy2(current_dir / 'pdfviewer2.html', dist_app_dir / 'pdfviewer2.html')
229 |
230 | # Copy merge_pdf.py
231 | if (current_dir / 'merge_pdf.py').exists():
232 | print(f"复制 merge_pdf.py 到 {dist_app_dir}")
233 | shutil.copy2(current_dir / 'merge_pdf.py', dist_app_dir / 'merge_pdf.py')
234 |
235 | # Copy config.json
236 | if (current_dir / 'config.json').exists():
237 | print(f"复制 config.json 到 {dist_app_dir}")
238 | shutil.copy2(current_dir / 'config.json', dist_app_dir / 'config.json')
239 |
240 | # Copy recent.json (if present)
241 | if (current_dir / 'recent.json').exists():
242 | print(f"复制 recent.json 到 {dist_app_dir}")
243 | shutil.copy2(current_dir / 'recent.json', dist_app_dir / 'recent.json')
244 |
245 | # Copy the static directory
246 | if (current_dir / 'static').exists():
247 | static_dest = dist_app_dir / 'static'
248 | print(f"复制 static 目录到 {static_dest}")
249 | if static_dest.exists():
250 | shutil.rmtree(static_dest)
251 | shutil.copytree(current_dir / 'static', static_dest)
252 |
253 | # Copy other files that may be needed
254 | other_files = ['README.md', 'LICENSE', 'requirements.txt']
255 | for file_name in other_files:
256 | if (current_dir / file_name).exists():
257 | print(f"复制 {file_name} 到 {dist_app_dir}")
258 | shutil.copy2(current_dir / file_name, dist_app_dir / file_name)
259 |
260 | # ────────────────────────────────────────────────────────────────────────
261 | # 7. Clean up temporary files
262 | # ────────────────────────────────────────────────────────────────────────
263 | build_dir = current_dir / "build"
264 | spec_file = current_dir / f"{exe_name}.spec"
265 |
266 | if build_dir.exists():
267 | print(f"清理 build 目录: {build_dir}")
268 | shutil.rmtree(build_dir)
269 | if spec_file.exists():
270 | print(f"删除 spec 文件: {spec_file}")
271 | spec_file.unlink()
272 |
273 | # ────────────────────────────────────────────────────────────────────────
274 | # 8. Done
275 | # ────────────────────────────────────────────────────────────────────────
276 | print("\n打包完成!")
277 | print(f"应用程序位于: {dist_app_dir}")
278 | print(f"可执行文件: {dist_app_dir / exe_name}{'.exe' if system == 'windows' else ''}")
279 | print("所有资源文件已直接复制到输出目录,可以直接查看和编辑。")
280 |
281 |
282 | if __name__ == "__main__":
283 | main()
--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "count": 2,
3 | "PPC": 20,
4 | "translation_services": {
5 | "Doubao": {
6 | "auth_key": "",
7 | "model_name": ""
8 | },
9 | "GLM": {
10 | "auth_key": "",
11 | "model_name": "glm-4-flash"
12 | },
13 | "Grok": {
14 | "auth_key": "",
15 | "model_name": "grok-2-latest"
16 | },
17 | "Qwen": {
18 | "auth_key": "",
19 | "model_name": "qwen-plus"
20 | },
21 | "ThirdParty": {
22 | "api_url": "https://api.chatanywhere.tech/v1/chat/completions",
23 | "auth_key": "",
24 | "model_name": "gpt-4o-mini"
25 | },
26 | "deepl": {
27 | "auth_key": ""
28 | },
29 | "deepseek": {
30 | "auth_key": "",
31 | "model_name": "deepseek-chat"
32 | },
33 | "openai": {
34 | "auth_key": "",
35 | "model_name": "gpt-4o-mini"
36 | },
37 | "youdao": {
38 | "app_key": "",
39 | "app_secret": ""
40 | }
41 | },
42 | "ocr_services": {
43 | "tesseract": {
44 | "path": "C:\\Program Files\\Tesseract-OCR\\tesseract.exe"
45 | }
46 | },
47 | "default_services": {
48 | "ocr_model": false,
49 | "Enable_translation": true,
50 | "Translation_api": "GLM"
51 | }
52 | }
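For reference, a minimal sketch of how this configuration is typically consumed on the Python side, mirroring the pattern used in YouDao_translation.py (the chosen service and printed fields are only illustrative):

import json

with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# Resolve the default translation backend, e.g. "GLM"
api_name = config["default_services"]["Translation_api"]
credentials = config["translation_services"].get(api_name)
if not credentials:
    raise ValueError(f"Translation service '{api_name}' not found in config")
print(api_name, credentials.get("model_name"))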
--------------------------------------------------------------------------------
/convert2pdf.py:
--------------------------------------------------------------------------------
1 | import fitz
2 | import os
3 |
4 |
5 | def convert_to_pdf(input_file, output_file=None):
6 | """
7 | 将支持的文档格式转换为 PDF,支持跨平台路径处理
8 |
9 | Args:
10 | input_file (str): 输入文件的完整路径
11 | output_file (str, optional): 输出PDF文件的完整路径。如果为None,则使用输入文件名+.pdf
12 |
13 | Returns:
14 | bool: 转换是否成功
15 | """
16 | try:
17 | # Normalize the path to handle platform-specific separators
18 | input_file = os.path.normpath(input_file)
19 |
20 | if not os.path.exists(input_file):
21 | print(f"错误:输入文件 '{input_file}' 不存在")
22 | return False
23 |
24 | # If no output file is given, derive the output path from the input file name
25 | if output_file is None:
26 | # Get the file name and directory
27 | file_dir = os.path.dirname(input_file)
28 | file_name = os.path.basename(input_file)
29 | name_without_ext = os.path.splitext(file_name)[0]
30 |
31 | # Create a PDF with the same name in the same directory
32 | output_file = os.path.join(file_dir, f"{name_without_ext}.pdf")
33 |
34 | # Make sure the output directory exists
35 | output_dir = os.path.dirname(output_file)
36 | if output_dir and not os.path.exists(output_dir):
37 | os.makedirs(output_dir, exist_ok=True)
38 |
39 | print(f"正在处理文件: {input_file}")
40 | print(f"输出文件将保存为: {output_file}")
41 |
42 | # 1. Open the document with fitz.open (EPUB, XPS, FB2, etc.)
43 | doc = fitz.open(input_file)
44 | print(f"文档页数: {len(doc)}")
45 |
46 | # 2. Call convert_to_pdf() to get a PDF byte stream
47 | pdf_bytes = doc.convert_to_pdf()
48 |
49 | # 3. Re-open that byte stream as a "pdf" document
50 | pdf_doc = fitz.open("pdf", pdf_bytes)
51 |
52 | # 4. Save it as a real PDF file
53 | pdf_doc.save(output_file)
54 |
55 | # Close the documents
56 | pdf_doc.close()
57 | doc.close()
58 |
59 | # Check that the output file was created
60 | if os.path.exists(output_file):
61 | print(f"转换成功!PDF文件已保存为: {output_file}")
62 | return True
63 | else:
64 | print("转换似乎完成,但输出文件未找到")
65 | return False
66 |
67 | except fitz.FileDataError as e:
68 | print(f"文件格式错误或文件损坏:{str(e)}")
69 | except PermissionError as e:
70 | print(f"权限错误:无法访问或写入文件 - {str(e)}")
71 | except Exception as e:
72 | print(f"转换失败,错误类型: {type(e).__name__}")
73 | print(f"错误详情: {str(e)}")
74 | # 在调试模式下打印完整的堆栈跟踪
75 | import traceback
76 | traceback.print_exc()
77 |
78 | return False
79 | # Usage example
80 | if __name__ == "__main__":
81 | # Single-file conversion example
82 | input_file = "666 (1).epub"
83 |
84 | # Validate the file extension
85 | if not input_file.lower().endswith(('.xps', '.epub', '.fb2', '.cbz', '.mobi')):
86 | print(f"不支持的文件格式。支持的格式包括: XPS, EPUB, FB2, CBZ, MOBI")
87 | else:
88 | convert_to_pdf(input_file)
89 |
90 | # Batch conversion example (see the sketch after this file)
91 | # input_directory = "documents"
92 | # batch_convert_to_pdf(input_directory)
93 |
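The batch example commented out above refers to a batch_convert_to_pdf helper that is not defined in this file; a minimal sketch of what such a helper could look like, reusing convert_to_pdf and the module's existing os import (the function name and directory layout are assumptions):

def batch_convert_to_pdf(input_directory):
    """Convert every supported e-book file in a directory to PDF (hypothetical helper)."""
    supported = ('.xps', '.epub', '.fb2', '.cbz', '.mobi')
    for file_name in os.listdir(input_directory):
        if file_name.lower().endswith(supported):
            convert_to_pdf(os.path.join(input_directory, file_name))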
--------------------------------------------------------------------------------
/demo.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/demo.mp4
--------------------------------------------------------------------------------
/demo.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/demo.pdf
--------------------------------------------------------------------------------
/demo_zh.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/demo_zh.pdf
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 | polyglotpdf:
4 | image: 2207397265/polyglotpdf:latest
5 | ports:
6 | - "12226:12226"
7 | volumes:
8 | - ./config/config.json:/app/config.json # configuration file
9 | - ./fonts:/app/fonts # font files
10 | - ./static/original:/app/static/original # original PDFs
11 | - ./static/target:/app/static/target # translated PDFs
12 | - ./static/merged_pdf:/app/static/merged_pdf # merged PDFs
13 | restart: unless-stopped
14 |
15 |
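Assuming the mounted directories exist locally, the stack can typically be started with "docker compose up -d" and reached on port 12226, matching the port mapping above.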
--------------------------------------------------------------------------------
/download_model.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import os
3 |
4 |
5 | support_language = [
6 | "en", # 英语 English
7 | "zh", # 中文 Chinese
8 | "es", # 西班牙语 Spanish
9 | "fr", # 法语 French
10 | "de", # 德语 German
11 | "ru", # 俄语 Russian
12 | "ar", # 阿拉伯语 Arabic
13 | "it", # 意大利语 Italian
14 | "ja", # 日语 Japanese
15 | "ko", # 韩语 Korean
16 | "nl", # 荷兰语 Dutch
17 | "pt", # 葡萄牙语 Portuguese
18 | "tr", # 土耳其语 Turkish
19 | "sv", # 瑞典语 Swedish
20 | "pl", # 波兰语 Polish
21 | "fi", # 芬兰语 Finnish
22 | "da", # 丹麦语 Danish
23 | "no", # 挪威语 Norwegian
24 | "cs", # 捷克语 Czech
25 | "el", # 希腊语 Greek
26 | "hu", # 匈牙利语 Hungarian
27 | "th" # 泰语 Thai
28 | ]
29 |
30 | def download_file(url, dest_folder, file_name):
31 | """
32 | Download a file and save it into the given folder.
33 | """
34 | response = requests.get(url, allow_redirects=True)
35 | if response.status_code == 200:
36 | with open(os.path.join(dest_folder, file_name), 'wb') as file:
37 | file.write(response.content)
38 | else:
39 | print(f"Failed to download {file_name}. Status code: {response.status_code}")
40 |
41 | def download_model_files(model_name):
42 | """
43 | Download the model files for the given model name.
44 | """
45 | # Files to download
46 | files_to_download = [
47 | "config.json",
48 | "pytorch_model.bin",
49 | "tokenizer_config.json",
50 | "vocab.json",
51 | "source.spm",
52 | "target.spm" # 如果模型不使用SentencePiece,这两个文件可能不需要
53 | ]
54 |
55 | # Create the model folder
56 |
57 | model_folder_name = model_name.split('/')[-1] # derive the folder name from the model name
58 | model_folder = os.path.join("translation_models", model_folder_name) # prepend the relative path
59 |
60 | if os.path.exists(model_folder):
61 | return
62 |
63 |
64 | if not os.path.exists(model_folder):
65 | os.makedirs(model_folder)
66 |
67 | # Build the download URLs and fetch each file
68 | base_url = f"https://huggingface.co/{model_name}/resolve/main/"
69 | for file_name in files_to_download:
70 | download_url = base_url + file_name
71 | print(f"Downloading {file_name}...")
72 | download_file(download_url, model_folder, file_name)
73 |
74 | # Example usage
75 | if __name__ == '__main__':
76 |
77 | model_name = "Helsinki-NLP/opus-mt-en-es"
78 | download_model_files(model_name)
79 |
80 |
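The files fetched above are Helsinki-NLP Marian checkpoints, which are normally loaded with the Hugging Face transformers library; transformers is not listed in requirements.txt, so the following is only a hedged sketch of how the downloaded folder might be used (the model folder and sample sentence are assumptions):

from transformers import MarianMTModel, MarianTokenizer

model_dir = "translation_models/opus-mt-en-es"  # folder created by download_model_files
tokenizer = MarianTokenizer.from_pretrained(model_dir)
model = MarianMTModel.from_pretrained(model_dir)

batch = tokenizer(["The weather is nice today."], return_tensors="pt", padding=True)
outputs = model.generate(**batch)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))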
--------------------------------------------------------------------------------
/icon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/icon.ico
--------------------------------------------------------------------------------
/languagedetect.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | text = "今日は(こんにちは)"
4 |
5 | # Method 2: use detect directly
6 | from langdetect import detect
7 | lang_code = detect(text)
8 | print(lang_code) # Output: ja
9 |
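langdetect can return different results for very short or ambiguous strings across runs; if deterministic output is needed, the library's documented DetectorFactory.seed setting can be used, as in this small sketch (not part of the file above):

from langdetect import DetectorFactory, detect

DetectorFactory.seed = 0  # make langdetect deterministic across runs
print(detect("今日は(こんにちは)"))  # expected: ja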
--------------------------------------------------------------------------------
/merge_pdf.py:
--------------------------------------------------------------------------------
1 | import fitz
2 | import os
3 |
4 | def merge_pdfs_horizontally(pdf1_path, pdf2_path, output_path, spacing=0):
5 | """
6 | 水平合并两个PDF文件的所有页面
7 | :param pdf1_path: 第一个PDF文件的绝对路径
8 | :param pdf2_path: 第二个PDF文件的绝对路径
9 | :param output_path: 输出PDF文件的绝对路径
10 | :param spacing: 两个PDF之间的间距(点)
11 | """
12 | # Make sure the input paths exist
13 | if not os.path.exists(pdf1_path):
14 | raise FileNotFoundError(f"找不到第一个PDF文件: {pdf1_path}")
15 | if not os.path.exists(pdf2_path):
16 | raise FileNotFoundError(f"找不到第二个PDF文件: {pdf2_path}")
17 |
18 | # Open the two source PDFs
19 | doc1 = fitz.open(pdf1_path)
20 | doc2 = fitz.open(pdf2_path)
21 |
22 | # Create a new PDF document
23 | result_doc = fitz.open()
24 |
25 | # Make sure both documents have at least one page
26 | if doc1.page_count == 0 or doc2.page_count == 0:
27 | raise ValueError("Both PDFs must have at least one page")
28 |
29 | # Make sure both PDFs have the same number of pages
30 | if doc1.page_count != doc2.page_count:
31 | raise ValueError("Both PDFs must have the same number of pages")
32 |
33 | # Process each page
34 | for page_num in range(doc1.page_count):
35 | # Get the current page from each PDF
36 | page1 = doc1[page_num]
37 | page2 = doc2[page_num]
38 |
39 | # Get the page sizes
40 | rect1 = page1.rect
41 | rect2 = page2.rect
42 |
43 | # Compute the size of the new page
44 | new_width = rect1.width + rect2.width + spacing
45 | new_height = max(rect1.height, rect2.height)
46 |
47 | # Create the new page
48 | new_page = result_doc.new_page(width=new_width, height=new_height)
49 |
50 | # Positioning matrix for the first PDF (kept on the left)
51 | matrix1 = fitz.Matrix(1, 1)
52 |
53 | # Positioning matrix for the second PDF (shifted to the right)
54 | matrix2 = fitz.Matrix(1, 1)
55 | x_shift = rect1.width + spacing
56 | matrix2.pretranslate(x_shift, 0)
57 |
58 | # Copy both page contents onto the new page
59 | new_page.show_pdf_page(rect1, doc1, page_num, matrix1)
60 | new_page.show_pdf_page(fitz.Rect(x_shift, 0, x_shift + rect2.width, new_height),
61 | doc2, page_num, matrix2)
62 |
63 | # Make sure the output directory exists
64 | output_dir = os.path.dirname(output_path)
65 | if not os.path.exists(output_dir):
66 | os.makedirs(output_dir)
67 |
68 | # Save the result
69 | result_doc.save(output_path)
70 |
71 | # Close all documents
72 | doc1.close()
73 | doc2.close()
74 | result_doc.close()
75 |
76 | # Usage example
77 | if __name__ == "__main__":
78 | pdf1_path = r"g6.pdf"
79 | pdf2_path = r"g6_zh.pdf"
80 | output_path = r"./output/merged.pdf"
81 |
82 | try:
83 | merge_pdfs_horizontally(pdf1_path, pdf2_path, output_path)
84 | print("PDFs merged successfully!")
85 | print(f"Output saved to: {output_path}")
86 | except FileNotFoundError as e:
87 | print(f"File error: {str(e)}")
88 | except Exception as e:
89 | print(f"Error occurred: {str(e)}")
90 |
--------------------------------------------------------------------------------
/pdf_thumbnail.py:
--------------------------------------------------------------------------------
1 | import fitz
2 | import os
3 |
4 |
5 | def create_pdf_thumbnail(pdf_path, width=400):
6 | """
7 | 为PDF文件第一页创建缩略图并保存到pdf_path上一层目录的thumbnail文件夹
8 |
9 | 参数:
10 | pdf_path: PDF文件路径
11 | width: 缩略图的宽度(像素)
12 | """
13 | try:
14 | # Get the PDF file name (without extension)
15 | pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
16 |
17 | # Get the absolute path of the PDF
18 | pdf_absolute_path = os.path.abspath(pdf_path)
19 |
20 | # Get the directory one level above the PDF's directory
21 | parent_dir = os.path.dirname(os.path.dirname(pdf_absolute_path))
22 |
23 | # Build the thumbnail path (the "thumbnail" folder in the parent directory)
24 | thumbnail_dir = os.path.join(parent_dir, 'thumbnail')
25 |
26 | # Create the directory if it does not exist
27 | os.makedirs(thumbnail_dir, exist_ok=True)
28 |
29 | # Build the output path
30 | output_path = os.path.join(thumbnail_dir, f"{pdf_filename}.png")
31 |
32 | # Open the PDF
33 | doc = fitz.open(pdf_path)
34 |
35 | # Get the first page
36 | first_page = doc[0]
37 |
38 | # Set the zoom factor
39 | zoom = width / first_page.rect.width
40 | matrix = fitz.Matrix(zoom, zoom)
41 |
42 | # Render the page to a pixmap
43 | pix = first_page.get_pixmap(matrix=matrix, alpha=False)
44 |
45 | # Save the image
46 | pix.save(output_path)
47 |
48 | # Close the PDF document
49 | doc.close()
50 |
51 | print(f"缩略图已保存到: {output_path}")
52 | return output_path
53 |
54 | except Exception as e:
55 | print(f"生成缩略图时发生错误: {str(e)}")
56 | return None
57 |
58 |
59 | # Usage example
60 | if __name__ == "__main__":
61 | # Path to the PDF file
62 | pdf_file = "g55.pdf"
63 | # Generate and save the thumbnail
64 | thumbnail_path = create_pdf_thumbnail(pdf_file, width=400)
65 |
--------------------------------------------------------------------------------
/pdfviewer.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Enhanced Split PDF Viewer
7 |
8 |
132 |
133 |
134 |
135 |
136 |
155 |
156 |
157 |
158 |
159 |
160 |
179 |
180 |
181 |
282 |
283 |
284 |
--------------------------------------------------------------------------------
/pdfviewer2.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | Single PDF Viewer
9 |
37 |
38 |
39 |
45 |
66 |
67 |
68 |
69 |
--------------------------------------------------------------------------------
/recent.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "index": 0,
4 | "date": "2025-04-13 01:34:09",
5 | "name": "2403.20127v1.pdf",
6 | "original_language": "auto",
7 | "target_language": "zh",
8 | "read": "0",
9 | "statue": "1"
10 | }
11 | ]
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | deepl==1.17.0
2 | Flask==2.0.1
3 | flask-cors
4 | Pillow==10.2.0
5 | PyMuPDF==1.24.0
6 | pytesseract==0.3.10
7 | requests==2.31.0
8 | tiktoken==0.6.0
9 | Werkzeug==2.0.1
10 | aiohttp
11 | fontTools
12 |
13 |
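Note that languagedetect.py imports langdetect, which is not pinned here; if that helper is used, it presumably has to be installed separately (for example with "pip install langdetect").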
--------------------------------------------------------------------------------
/static/1.js:
--------------------------------------------------------------------------------
1 |
2 | // Variables defined at global scope
3 |
4 |
5 | // Show the home page
6 | function showHome() {
7 | document.getElementById('recentread').innerHTML = 'Recent Reading';
8 | document.getElementById('articleContainer').style.display = '';
9 | document.getElementById('viewAllSection').style.display = 'flex';
10 | document.querySelector('.sidebar-menu a[onclick="showHome()"]').classList.add('active');
11 | document.querySelector('.sidebar-menu a[onclick="showAllRecent()"]').classList.remove('active');
12 | document.querySelector('.sidebar-menu a[onclick="showSetup()"]').classList.remove('active'); // 添加这行
13 | loadArticles(true,true);
14 | document.getElementById('t-container').style.display = '';
15 | }
16 |
17 | function showAllRecent() {
18 | document.getElementById('recentread').innerHTML = 'Recent Reading';
19 |
20 | document.getElementById('articleContainer').style.display = '';
21 | document.getElementById('viewAllSection').style.display = 'none';
22 | document.querySelector('.sidebar-menu a[onclick="showHome()"]').classList.remove('active');
23 | document.querySelector('.sidebar-menu a[onclick="showAllRecent()"]').classList.add('active');
24 | document.querySelector('.sidebar-menu a[onclick="showSetup()"]').classList.remove('active'); // 添加这行
25 | loadArticles(false,true);
26 | document.getElementById('t-container').style.display = '';
27 | }
28 | // New function to handle the Setup steps view
29 | function showSetup() {
30 | // Hide other sections (if needed)
31 |
32 |
33 | document.getElementById('recentread').innerHTML = 'config.json';
34 | document.getElementById('articleContainer').style.display = 'none';
35 | document.getElementById('viewAllSection').style.display = 'none';
36 |
37 |
38 | // Remove the active class from the other menu items
39 | document.querySelector('.sidebar-menu a[onclick="showHome()"]').classList.remove('active');
40 | document.querySelector('.sidebar-menu a[onclick="showAllRecent()"]').classList.remove('active');
41 |
42 | // Add the active class to the Setup steps item
43 | document.querySelector('.sidebar-menu a[onclick="showSetup()"]').classList.add('active');
44 | document.getElementById('t-container').style.display = 'block';
45 | }
46 |
47 | // Show the upload modal
48 | function showUpload() {
49 | document.getElementById('uploadModal').style.display = 'block';
50 | document.getElementById('upload_content-1').style.display = 'block';
51 | document.getElementById('upload_content-2').style.display = 'none';
52 | document.getElementById('languageSelection').style.display = 'none';
53 |
54 | }
55 |
56 |
57 |
58 | // Show the settings modal
59 | function showSettings() {
60 | document.getElementById('settingsModal').style.display = 'block';
61 | }
62 |
63 |
64 | async function loadArticles(isLimited,first_reload) {
65 | const container = document.getElementById('articleContainer');
66 | if (first_reload) {
67 | const record_show_staute = document.getElementById('record_show_staute');
68 | record_show_staute.setAttribute('data-value', isLimited);
69 | }
70 |
71 |
72 |
73 |
74 | try {
75 | container.innerHTML = 'Loading data...
';
76 |
77 | const response = await fetch('/recent.json');
78 | if (!response.ok) {
79 | throw new Error(`HTTP error! status: ${response.status}`);
80 | }
81 |
82 | const data = await response.json();
83 | container.innerHTML = '';
84 |
85 | if (data.length === 0) {
86 | container.innerHTML = 'No reading records yet
';
87 | return;
88 | }
89 |
90 | // Sort by index (descending)
91 | let sortedArticles = [...data].sort((a, b) => b.index - a.index);
92 |
93 | // Limit the number of items shown if requested
94 | if (isLimited) {
95 | sortedArticles = sortedArticles.slice(0, 3);
96 | }
97 |
98 | sortedArticles.forEach(article => {
99 | const articleCard = document.createElement('a');
100 | articleCard.className = 'article-card';
101 |
102 | // Top half of the card
103 | const topDiv = document.createElement('div');
104 | topDiv.className = 'article-top';
105 | topDiv.innerHTML = `
106 |
107 |
108 | `;
109 |
110 |
111 | // Bottom half of the card
112 | const bottomDiv = document.createElement('div');
113 | bottomDiv.className = 'article-bottom';
114 |
115 | // Article title
116 | const titleDiv = document.createElement('div');
117 | titleDiv.className = 'article-title';
118 | titleDiv.innerHTML = `${article.name} `;
119 |
120 | // Info row
121 | const infoDiv = document.createElement('div');
122 | infoDiv.className = 'article-info';
123 | infoDiv.innerHTML = `
124 | ${article.author || 'Unknown author'}
125 | ${article.date}
126 | ${article.original_language} - ${article.target_language}
127 | `;
128 |
129 | bottomDiv.appendChild(titleDiv);
130 | bottomDiv.appendChild(infoDiv);
131 |
132 | // Status indicator
133 | const statusIndicator = document.createElement('div');
134 | statusIndicator.className = 'status-indicator';
135 |
136 | if (parseInt(article.statue) === 0) {
137 | statusIndicator.innerHTML = ' ';
138 | articleCard.className += ' disabled';
139 | articleCard.addEventListener('click', (e) => {
140 | e.preventDefault();
141 | showToast('Translation is not complete yet, unable to view at this time.');
142 | });
143 | } else {
144 | statusIndicator.innerHTML = ' ';
145 | articleCard.addEventListener('click', () => {
146 | const targetFileName = `${article.name.replace(/\.pdf$/, '')}_${article.target_language}.pdf`;
147 | const url = `/pdfviewer.html?name=${encodeURIComponent(article.name)}&name_target_language=${encodeURIComponent(targetFileName)}&index=${encodeURIComponent(article.index)}`;
148 | window.open(url, '_blank');
149 | });
150 | articleCard.style.cursor = 'pointer';
151 |
152 | }
153 | bottomDiv.appendChild(statusIndicator);
154 |
155 | // Read-status label
156 | const readStatus = document.createElement('div');
157 | readStatus.className = `read-status ${parseInt(article.read) === 0 ? 'unread' : 'read'}`;
158 | readStatus.textContent = parseInt(article.read) === 0 ? 'Unread' : 'Read';
159 |
160 | // Three-dot menu button
161 | const menuButton = document.createElement('button');
162 | menuButton.className = 'menu-button';
163 | menuButton.innerHTML = ' ';
164 |
165 | articleCard.appendChild(topDiv);
166 | articleCard.appendChild(bottomDiv);
167 | articleCard.appendChild(readStatus);
168 | articleCard.appendChild(menuButton);
169 |
170 | container.appendChild(articleCard);
171 |
172 | // Menu button click handler
173 | menuButton.addEventListener('click', (e) => {
174 | e.preventDefault();
175 | e.stopPropagation();
176 | showMenu(e, article, e.currentTarget);
177 | });
178 | });
179 | } catch (error) {
180 | console.error('Failed to load data:', error);
181 | container.innerHTML = `
182 |
183 | Failed to load data, please try again later
184 | ${error.message}
185 |
186 | `;
187 | }
188 |
189 | }
190 |
191 | // Show menu function
192 |
193 |
194 |
195 |
196 | // Toast helper
197 | function showToast(message) {
198 | const toast = document.createElement('div');
199 | toast.className = 'toast';
200 | toast.textContent = message;
201 | document.body.appendChild(toast);
202 |
203 | setTimeout(() => {
204 | toast.remove();
205 | }, 2000);
206 | }
207 |
208 |
209 | // Show menu function
210 | function showMenu(event, article) {
211 | const menu = document.createElement('div');
212 | articleId = article.index
213 | article_name= article.name
214 | article_tl = article.target_language
215 | article_ol = article.original_language
216 | console.log(2,articleId)
217 | menu.className = 'article-menu';
218 | menu.innerHTML = `
219 |
220 |
221 |
222 | `;
223 |
224 | // Position the menu
225 | menu.style.position = 'absolute';
226 | menu.style.top = `${event.pageY}px`;
227 | menu.style.left = `${event.pageX}px`;
228 |
229 | document.body.appendChild(menu);
230 |
231 | // Close the menu when clicking elsewhere
232 | document.addEventListener('click', function closeMenu(e) {
233 | if (!menu.contains(e.target) && e.target !== event.target) {
234 | menu.remove();
235 | document.removeEventListener('click', closeMenu);
236 | }
237 | });
238 | }
239 |
240 |
241 | function open_bilingual(articleId,article_name,article_tl,article_ol) {
242 | const url = `/pdfviewer2.html?name=${encodeURIComponent(article_name)}&target_language=${encodeURIComponent(article_tl)}&index=${encodeURIComponent(articleId)}&original_language=${encodeURIComponent(article_ol)}`;
243 | window.open(url, '_blank');
244 | }
245 |
246 |
247 | // Toast helper (duplicate definition)
248 | function showToast(message) {
249 | const toast = document.createElement('div');
250 | toast.className = 'toast';
251 | toast.textContent = message;
252 | document.body.appendChild(toast);
253 |
254 | setTimeout(() => {
255 | toast.remove();
256 | }, 2000);
257 | }
258 |
259 |
260 |
261 | // Initialize once the page has loaded
262 | document.addEventListener('DOMContentLoaded', function() {
263 | showHome();
264 | });
265 | function closeUploadModal() {
266 | // Hide the modal
267 | document.getElementById('uploadModal').style.display = 'none';
268 | // Clear the file list display
269 | document.getElementById('uploadFilesList').innerHTML = '';
270 | // Clear the uploadFiles Map
271 | uploadFiles.clear();
272 |
273 | // Reset the upload UI (if needed)
274 | document.getElementById('upload_content-1').style.display = 'flex';
275 | document.getElementById('upload_content-2').style.display = 'none';
276 | }
277 |
278 |
279 |
--------------------------------------------------------------------------------
/static/2.js:
--------------------------------------------------------------------------------
1 | // Global object holding API keys
2 | let translationKeys = {
3 | deepl: '',
4 | google: '',
5 | youdao: '',
6 | aliyun: '',
7 | tencent: '',
8 | Grok: '', // note: the key is capitalized "Grok"
9 | ThirdParty: '', // ThirdParty service
10 | GLM: '', // GLM service
11 | bing: '' // Bing service
12 | };
13 |
14 | // Close the settings modal
15 | function closeSettings() {
16 | document.getElementById('settingsModal').style.display = 'none';
17 |
18 | }
19 | const toggle = document.getElementById('ocrToggle');
20 | const toggle2 = document.getElementById('translationToggle');
21 | function getValue() {
22 | return toggle.checked ?
23 | toggle.getAttribute('data-on') :
24 | toggle.getAttribute('data-off');
25 | }
26 | function getValue2() {
27 | return toggle2.checked ?
28 | toggle2.getAttribute('data-on') :
29 | toggle2.getAttribute('data-off');
30 | }
31 |
32 |
33 | function getecount() {
34 | fetch('/api/get-default-services')
35 | .then(response => response.json())
36 | .then(data => {
37 | if (data.success && data.data) {
38 | const settings = data.data;
39 |
40 | document.getElementById('count_article').textContent = ` Articles in Total: ${settings.count} `;
41 | }
42 | })
43 | .catch(error => {
44 | console.error('Failed to fetch settings:', error);
45 | alert('Failed to fetch settings, please try again later');
46 | });
47 | }
48 |
49 |
50 |
51 |
--------------------------------------------------------------------------------
/static/4.js:
--------------------------------------------------------------------------------
1 |
2 | // Article IDs selected for batch operations
3 | let selectedBatchIds = new Set();
4 |
5 | // Show the batch-management modal
6 | function showBatchModal() {
7 | document.getElementById('batchModal').style.display = 'block';
8 | loadBatchData(); // fetch the data and render the cards
9 | }
10 |
11 | // Close the batch-management modal
12 | function closeBatchModal() {
13 | document.getElementById('batchModal').style.display = 'none';
14 | // Clear the selection when closing
15 | selectedBatchIds.clear();
16 | }
17 |
18 | // Load recent.json and render it into the batch modal
19 | async function loadBatchData() {
20 | const container = document.getElementById('batchGrid');
21 | container.innerHTML = 'Loading data...
';
22 | try {
23 | const response = await fetch('/recent.json');
24 | if (!response.ok) {
25 | throw new Error(`HTTP error! status: ${response.status}`);
26 | }
27 | const data = await response.json();
28 | container.innerHTML = '';
29 |
30 | if (!data || data.length === 0) {
31 | container.innerHTML = 'No records to batch manage
';
32 | return;
33 | }
34 |
35 | // Sort by index, descending
36 | const sortedData = data.sort((a, b) => b.index - a.index);
37 |
38 | // Render the cards into the container
39 | sortedData.forEach(item => {
40 | const card = document.createElement('div');
41 | card.className = 'batch-card';
42 | card.dataset.indexId = item.index; // keep it for later operations
43 |
44 | // Read / unread
45 | const readStatus = item.read === "1" ? "Read" : "Unread";
46 |
47 | // Note: fall back to "Unknown author" if the backend returns no author
48 | const author = item.author || "Unknown author";
49 | const original_lan = item.original_language ;
50 | const target_lan = item.target_language;
51 |
52 | card.innerHTML = `
53 | ${item.name}
54 |
55 |
Date: ${item.date}
56 |
Author: ${author}
57 |
Status: ${readStatus} || Conversion:
58 |
59 | ${original_lan} to ${target_lan}
60 |
61 |
62 | `;
63 |
64 | // Click to select or deselect
65 | card.addEventListener('click', () => {
66 | if (selectedBatchIds.has(item.index)) {
67 | selectedBatchIds.delete(item.index);
68 | card.classList.remove('selected');
69 | } else {
70 | selectedBatchIds.add(item.index);
71 | card.classList.add('selected');
72 | }
73 | });
74 |
75 | container.appendChild(card);
76 | });
77 | } catch (error) {
78 | console.error('Failed to load data:', error);
79 | container.innerHTML = `Failed to load data ${error.message}
`;
80 | }
81 | }
82 |
83 | // Select all / deselect all
84 | function toggleSelectAll() {
85 | const container = document.getElementById('batchGrid');
86 | const cards = container.querySelectorAll('.batch-card');
87 |
88 | // If any card is still unselected, this click selects all; otherwise it deselects all
89 | let shouldSelectAll = false;
90 | if (selectedBatchIds.size < cards.length) {
91 | // some cards are still unselected, so select all
92 | shouldSelectAll = true;
93 | }
94 |
95 | cards.forEach(card => {
96 | const indexId = parseInt(card.dataset.indexId, 10);
97 | if (shouldSelectAll) {
98 | selectedBatchIds.add(indexId);
99 | card.classList.add('selected');
100 | } else {
101 | selectedBatchIds.delete(indexId);
102 | card.classList.remove('selected');
103 | }
104 | });
105 | }
106 |
107 | // Batch delete
108 | async function handleBatchDelete() {
109 | if (selectedBatchIds.size === 0) {
110 | alert('No articles selected!');
111 | return;
112 | }
113 |
114 | // Simple confirmation
115 | if (!confirm('Are you sure you want to delete the selected items?')) {
116 | return;
117 | }
118 |
119 | // Send the request to the backend
120 | try {
121 | // Assumes the backend exposes a /delete_batch endpoint (see the sketch after this file)
122 | const response = await fetch('/delete_batch', {
123 | method: 'POST',
124 | headers: {
125 | 'Content-Type': 'application/json'
126 | },
127 | body: JSON.stringify({
128 | articleIds: Array.from(selectedBatchIds)
129 | })
130 | });
131 |
132 | if (!response.ok) throw new Error('Delete failed');
133 |
134 | // Refresh the modal data after a successful delete
135 | selectedBatchIds.clear();
136 | loadBatchData();
137 | getecount();
138 | } catch (error) {
139 | console.error('Delete failed:', error);
140 | alert('Delete failed, please try again!');
141 | }
142 | }
143 |
144 | // Generate a mind map
145 | function handleMindMap() {
146 | if (selectedBatchIds.size === 0) {
147 | alert('No articles selected for mind map!');
148 | return;
149 | }
150 |
151 | // For demo purposes this just logs to the console; replace it with a real request
152 | console.log('Generating mind map, selected IDs:', Array.from(selectedBatchIds));
153 | alert('Pretend to generate Mind Map for selected items');
154 | }
155 |
156 | // Summary
157 | function handleSummary() {
158 | if (selectedBatchIds.size === 0) {
159 | alert('No articles selected for summary!');
160 | return;
161 | }
162 |
163 | // As above, this could call a real backend endpoint instead
164 | console.log('Generating summary, selected IDs:', Array.from(selectedBatchIds));
165 | alert('Pretend to generate Summary for selected items');
166 | }
167 |
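handleBatchDelete above posts to a /delete_batch endpoint that is assumed to be implemented in app.py; a minimal Flask sketch of such a route (the route name matches the front-end call, but the record handling shown here is only an assumption):

from flask import Flask, request, jsonify
import json

app = Flask(__name__)

@app.route('/delete_batch', methods=['POST'])
def delete_batch():
    # Hypothetical handler matching the fetch('/delete_batch', ...) call in static/4.js
    ids = set(request.get_json().get('articleIds', []))
    with open('recent.json', 'r', encoding='utf-8') as f:
        records = json.load(f)
    records = [r for r in records if r['index'] not in ids]
    with open('recent.json', 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=2)
    return jsonify({'success': True})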
--------------------------------------------------------------------------------
/static/Figure_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/Figure_1.png
--------------------------------------------------------------------------------
/static/Line-model-demo.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/Line-model-demo.pdf
--------------------------------------------------------------------------------
/static/Line-model-demo_zh.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/Line-model-demo_zh.pdf
--------------------------------------------------------------------------------
/static/PolyglotPDF.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/PolyglotPDF.png
--------------------------------------------------------------------------------
/static/colorspace_issue_sample.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/colorspace_issue_sample.pdf
--------------------------------------------------------------------------------
/static/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/demo.gif
--------------------------------------------------------------------------------
/static/demo.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/demo.mp4
--------------------------------------------------------------------------------
/static/merged_pdf/2403.20127v1_auto_zh.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/merged_pdf/2403.20127v1_auto_zh.pdf
--------------------------------------------------------------------------------
/static/original/2403.20127v1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/original/2403.20127v1.pdf
--------------------------------------------------------------------------------
/static/original/2501.05450v1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/original/2501.05450v1.pdf
--------------------------------------------------------------------------------
/static/original/demo.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/original/demo.pdf
--------------------------------------------------------------------------------
/static/page1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/page1.png
--------------------------------------------------------------------------------
/static/page2.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/page2.jpeg
--------------------------------------------------------------------------------
/static/page3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/page3.png
--------------------------------------------------------------------------------
/static/page4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/page4.png
--------------------------------------------------------------------------------
/static/setup.css:
--------------------------------------------------------------------------------
1 |
2 | select.t-input {
3 | width: 300px;
4 | padding: 8px;
5 | border: 1px solid #ccc;
6 | border-radius: 4px;
7 | background-color: white;
8 | transition: border-color 0.2s ease-in-out, box-shadow 0.2s ease-in-out;
9 | }
10 |
11 | select.t-input:focus {
12 | outline: none;
13 | border-color: #007bff;
14 | box-shadow: 0 0 5px rgba(0, 123, 255, 0.25);
15 | }
16 |
17 | /* Base styles for the container, headings, etc. */
18 | .t-container {
19 | display: none;
20 | max-width: 90%;
21 |
22 | margin: 0 auto;
23 | font-family: Arial, sans-serif;
24 | }
25 | .t-header-container {
26 | display: flex;
27 | justify-content: space-between;
28 | align-items: center;
29 | margin-bottom: 20px;
30 | }
31 | .t-section {
32 | border: 1px solid #ddd;
33 | margin: 10px 0;
34 | padding: 10px;
35 | border-radius: 4px;
36 | background: #fff;
37 | }
38 |
39 | /* Base styles for the expand/collapse area */
40 | .t-section-header {
41 | display: flex;
42 | justify-content: space-between;
43 | align-items: center;
44 | cursor: pointer;
45 |
46 | }
47 |
48 | /* Expand/collapse button: a "plus" icon with rotate and fade effects */
49 | .t-toggle-btn {
50 | background: none;
51 | border: none;
52 | font-size: 18px;
53 | cursor: pointer;
54 | transition: transform 0.4s ease;
55 | width: 30px;
56 | height: 30px;
57 | border-radius: 4px;
58 | display: flex;
59 | align-items: center;
60 | justify-content: center;
61 | color: #000;
62 | position: relative;
63 | }
64 | .t-toggle-btn:hover {
65 | background: #f0f0f0;
66 | }
67 | .t-toggle-btn.t-active {
68 | transform: rotate(45deg);
69 | color: #007bff;
70 | }
71 |
72 | /* Animated expansion of the content area: max-height + opacity transition */
73 | .t-content {
74 | max-height: 0;
75 | overflow: hidden;
76 | transition: max-height 0.4s ease, opacity 0.4s ease;
77 | opacity: 0;
78 | }
79 | .t-content.t-active {
80 | max-height: 1000px; /* increase as needed for taller content */
81 | opacity: 1;
82 | }
83 |
84 | /* Sub-section card */
85 | .t-sub-section {
86 | margin-left: 20px;
87 | padding: 10px;
88 | border: 2px solid #eee;
89 | margin-top: 10px;
90 | border-radius: 4px;
91 | background-color: white;
92 | }
93 |
94 | /* Input groups and labels */
95 | .t-input-group {
96 | margin: 10px 0;
97 | }
98 | .t-input-group label {
99 | display: inline-block;
100 | width: 150px;
101 | font-weight: bold;
102 | color: #555;
103 | }
104 |
105 | /* Count display */
106 | .t-count-display {
107 | padding: 5px 10px;
108 | background-color: #f8f9fa;
109 | border: 1px solid #ddd;
110 | border-radius: 4px;
111 | display: inline-block;
112 | }
113 |
114 | /* Styled input */
115 | .t-input {
116 | width: 300px;
117 | padding: 8px;
118 | border: 1px solid #ccc;
119 | border-radius: 4px;
120 | transition: border-color 0.2s ease-in-out, box-shadow 0.2s ease-in-out;
121 | }
122 | .t-input:focus {
123 | outline: none;
124 | border-color: #007bff;
125 | box-shadow: 0 0 5px rgba(0, 123, 255, 0.25);
126 | }
127 |
128 | /* Save button */
129 | .t-save-btn {
130 | background-color: #6366f1;
131 | color: white;
132 | border: none;
133 | padding: 10px 20px;
134 | border-radius: 5px;
135 | cursor: pointer;
136 | transition: background-color 0.3s;
137 | }
138 |
139 | .t-save-btn:hover {
140 | background-color: #4f46e5;
141 | }
142 |
143 | .t-save-btn.success {
144 | background-color: #22c55e;
145 | }
146 |
147 | .t-ppc {
148 | /* Basic layout */
149 | width: 120px;
150 | padding: 8px;
151 | font-size: 14px;
152 |
153 | /* Border and corner radius */
154 | border: 1px solid #ccc;
155 | border-radius: 4px;
156 |
157 | /* Other appearance */
158 | color: #333;
159 | background-color: #f9f9f9;
160 | outline: none;
161 |
162 | /* Transition for smoother interaction */
163 | transition: border-color 0.3s, box-shadow 0.3s;
164 | }
165 |
166 | .t-ppc:focus {
167 | /* Change the border color on focus, e.g. to blue */
168 | border-color: #4A90E2;
169 | box-shadow: 0 0 5px rgba(74,144,226,0.5);
170 | }
171 |
--------------------------------------------------------------------------------
/static/setup.js:
--------------------------------------------------------------------------------
1 | // Fetch the configuration when the page loads
2 | fetch('/config_json')
3 | .then(response => response.json())
4 | .then(data => {
5 | initializeUI(data);
6 | });
7 |
8 | // Initialize the UI
9 |
10 | function initializeUI(data) {
11 | document.getElementById('t-count').textContent = data.count;
12 | document.getElementById('t-count').value = data.count;
13 | document.getElementById('t-ppc').textContent = data.PPC;
14 | document.getElementById('t-ppc').value = data.PPC;
15 | document.getElementById('count_article').textContent += data.count;
16 |
17 |
18 | console.log('count', data.count);
19 |
20 | // Initialize the translation services (block unchanged)
21 | const translationServices = document.getElementById('t-translation-services');
22 | Object.entries(data.translation_services).forEach(([service, config]) => {
23 | const serviceDiv = createServiceSection(service, config);
24 | translationServices.appendChild(serviceDiv);
25 | });
26 |
27 | // Initialize the OCR services (block unchanged)
28 | const ocrServices = document.getElementById('t-ocr-services');
29 | Object.entries(data.ocr_services).forEach(([service, config]) => {
30 | const serviceDiv = createServiceSection(service, config);
31 | ocrServices.appendChild(serviceDiv);
32 | });
33 |
34 | // Initialize the default services configuration
35 | const defaultServices = document.getElementById('t-default-services');
36 | console.log('api',data.default_services.Translation_api)
37 | const defaultConfig = {
38 | 'ocr_model': {
39 | type: 'select',
40 | options: ['true', 'false'],
41 | value: data.default_services.ocr_model
42 | },
43 | 'Enable_translation': {
44 | type: 'select',
45 | options: ['true', 'false'],
46 | value: data.default_services.Enable_translation
47 | },
48 | 'Translation_api': {
49 | type: 'select',
50 | options: ['Doubao', 'Qwen', 'deepseek', 'openai', 'deepl', 'youdao','Grok', 'ThirdParty', 'GLM', 'bing'],
51 | value: data.default_services.Translation_api
52 | }
53 | };
54 |
55 | // Section adjusted inside initializeUI
56 | Object.entries(defaultConfig).forEach(([key, config]) => {
57 | const inputGroup = document.createElement('div');
58 | inputGroup.className = 't-input-group';
59 |
60 | const select = document.createElement('select');
61 | select.className = 't-input';
62 |
63 | config.options.forEach(option => {
64 | const optionElement = document.createElement('option');
65 | optionElement.value = option;
66 | optionElement.textContent = option;
67 |
68 | // Option-matching logic
69 | if (key === 'Translation_api') {
70 | // Compare the string values directly
71 | optionElement.selected = (option === config.value);
72 | console.log(`Translation API option: ${option}, config value: ${config.value}, selected: ${optionElement.selected}`);
73 | } else if (key === 'ocr_model' || key === 'Enable_translation' ) {
74 | const optionBool = option.toLowerCase() === 'true';
75 | optionElement.selected = (optionBool === config.value);
76 | }
77 |
78 | select.appendChild(optionElement);
79 | });
80 | if (key === 'Enable_translation') {
81 | inputGroup.innerHTML = `${key}: `;
82 | } else {
83 | inputGroup.innerHTML = `${key}: `;
84 | }
85 |
86 |
87 | inputGroup.appendChild(select);
88 | defaultServices.appendChild(inputGroup);
89 | });
90 |
91 | }
92 |
93 | // Create a service configuration section
94 | function createServiceSection(serviceName, config) {
95 | const section = document.createElement('div');
96 | section.className = 't-sub-section';
97 |
98 | const header = document.createElement('div');
99 | header.className = 't-section-header';
100 | header.innerHTML = `
101 | ${serviceName}
102 | +
103 | `;
104 |
105 | const content = document.createElement('div');
106 | content.className = 't-content';
107 |
108 | Object.entries(config).forEach(([key, value]) => {
109 | const inputGroup = document.createElement('div');
110 | inputGroup.className = 't-input-group';
111 | inputGroup.innerHTML = `
112 | ${key}:
113 |
114 | `;
115 | content.appendChild(inputGroup);
116 | });
117 |
118 |
119 | section.appendChild(header);
120 | section.appendChild(content);
121 |
122 | return section;
123 | }
124 |
125 | // Expand/collapse behaviour
126 | document.addEventListener('click', function(e) {
127 | if (e.target.classList.contains('t-toggle-btn')) {
128 | const button = e.target;
129 | const content = button.closest('.t-section-header').nextElementSibling;
130 | button.classList.toggle('t-active');
131 | content.classList.toggle('t-active');
132 | }
133 | });
134 |
135 | // Auto-save behaviour
136 | let saveTimeout;
137 | document.addEventListener('input', function(e) {
138 | if (e.target.classList.contains('t-input')) {
139 | clearTimeout(saveTimeout);
140 | saveTimeout = setTimeout(() => {
141 | // Collect the current configuration
142 | const config = collectConfig();
143 | // Send it to the backend
144 | fetch('/update_config', {
145 | method: 'POST',
146 | headers: {
147 | 'Content-Type': 'application/json',
148 | },
149 | body: JSON.stringify(config)
150 | });
151 | }, 5000);
152 | }
153 | });
154 | async function saveall() {
155 | const saveall = document.getElementById('saveall');
156 |
157 |
158 | // Add toggle event listeners
159 |
160 |
161 | try {
162 | // Send the data to the backend
163 |
164 | const config = collectConfig();
165 |
166 | const response = await fetch('/save_all', {
167 | method: 'POST',
168 | headers: {
169 | 'Content-Type': 'application/json',
170 | },
171 | body: JSON.stringify(config)
172 | });
173 |
174 | if (!response.ok) {
175 | throw new Error('Save failed');
176 | }
177 |
178 | // Show success state
179 | saveall.innerHTML = '✓';
180 | saveall.classList.add('success');
181 |
182 | // Restore the button state after 2 seconds
183 | setTimeout(() => {
184 | saveall.innerHTML = 'Save all changes';
185 | saveall.classList.remove('success');
186 | }, 2000);
187 |
188 |
189 |
190 | } catch (error) {
191 | console.error('Failed to save settings:', error);
192 | alert('Failed to save settings, please try again');
193 | }
194 | }
195 | // Save all changes
196 | document.querySelector('.t-save-btn').addEventListener('click', function() {
197 | const config = collectConfig();
198 | fetch('/save_all', {
199 | method: 'POST',
200 | headers: {
201 | 'Content-Type': 'application/json',
202 | },
203 | body: JSON.stringify(config)
204 | });
205 | });
206 |
207 | // Collect all configuration data
208 |
209 | function collectConfig() {
210 | const config = {
211 | count: document.getElementById('t-count').value,
212 | PPC: parseInt(document.getElementById('t-ppc').value, 10),
213 | translation_services: {},
214 | ocr_services: {},
215 | default_services: {}
216 | };
217 |
218 | // Collect translation service settings
219 | const translationServices = document.getElementById('t-translation-services');
220 | [...translationServices.getElementsByClassName('t-sub-section')].forEach(section => {
221 | const serviceName = section.querySelector('h4').textContent;
222 | config.translation_services[serviceName] = {};
223 | [...section.getElementsByClassName('t-input-group')].forEach(group => {
224 | const key = group.querySelector('label').textContent.replace(':', '');
225 | const value = group.querySelector('input').value;
226 | config.translation_services[serviceName][key] = value;
227 | });
228 | });
229 |
230 | // Collect OCR service settings
231 | const ocrServices = document.getElementById('t-ocr-services');
232 | [...ocrServices.getElementsByClassName('t-sub-section')].forEach(section => {
233 | const serviceName = section.querySelector('h4').textContent;
234 | config.ocr_services[serviceName] = {};
235 | [...section.getElementsByClassName('t-input-group')].forEach(group => {
236 | const key = group.querySelector('label').textContent.replace(':', '');
237 | const value = group.querySelector('input').value;
238 | config.ocr_services[serviceName][key] = value;
239 | });
240 | });
241 |
242 | // Collect the default services settings
243 |
244 | const defaultServices = document.getElementById('t-default-services');
245 | [...defaultServices.getElementsByClassName('t-input-group')].forEach(group => {
246 | const key = group.querySelector('label').textContent.replace(':', '');
247 | let value = group.querySelector('select').value;
248 |
249 | // Convert specific keys to booleans
250 | if(key === 'ocr_model' || key === 'Enable_translation' ) {
251 | value = value === 'true' ? true : false;
252 | }
253 |
254 |
255 | config.default_services[key] = value;
256 | });
257 |
258 |
259 | return config;
260 | }
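// Illustrative shape of the object collectConfig() returns; the service names,
// keys and sample values below are placeholders, not taken from a real config:
// {
//   count: "3",                       // string from the #t-count input
//   PPC: 16,                          // integer from the #t-ppc input
//   translation_services: { "<service name>": { "<label>": "<input value>" } },
//   ocr_services: { "<service name>": { "<label>": "<input value>" } },
//   default_services: { ocr_model: false, Enable_translation: true, "<key>": "<value>" }
// }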
261 |
262 | // When loading the translation service settings, make sure the Grok option is handled
263 | function loadTranslationServices(config) {
264 | const container = document.getElementById('t-translation-services');
265 | // ...existing code...
266 |
267 | // Make sure Grok is included when building the service settings UI
268 | // Use the correct key name 'Grok', not 'grok'
269 | if (config.translation_services && config.translation_services.Grok) {
270 | const grokDiv = document.createElement('div');
271 | grokDiv.className = 't-service';
272 | grokDiv.innerHTML = `
273 | Grok Translate API
274 |
275 | Auth Key:
276 |
277 |
278 |
279 | Model Name:
280 |
281 |
282 | `;
283 | container.appendChild(grokDiv);
284 | }
285 |
286 | // Make sure GLM is included when building the service settings UI
287 | if (config.translation_services && config.translation_services.GLM) {
288 | const glmDiv = document.createElement('div');
289 | glmDiv.className = 't-service';
290 | glmDiv.innerHTML = `
291 | GLM Translate API
292 |
293 | Auth Key:
294 |
295 |
296 |
297 | Model Name:
298 |
299 |
300 | `;
301 | container.appendChild(glmDiv);
302 | }
303 |
304 | // Add the ThirdParty service settings
305 | if (config.translation_services && config.translation_services.ThirdParty) {
306 | const thirdPartyDiv = document.createElement('div');
307 | thirdPartyDiv.className = 't-sub-section';
308 | thirdPartyDiv.innerHTML = `
309 |
313 |
327 | `;
328 | container.appendChild(thirdPartyDiv);
329 | } else {
330 | // If no ThirdParty settings exist, create a default entry
331 | const thirdPartyDiv = document.createElement('div');
332 | thirdPartyDiv.className = 't-sub-section';
333 | thirdPartyDiv.innerHTML = `
334 |
338 |
352 | `;
353 | container.appendChild(thirdPartyDiv);
354 | }
355 |
356 | // Add the Bing service settings UI
357 | if (config.translation_services && config.translation_services.bing) {
358 | const bingDiv = document.createElement('div');
359 | bingDiv.className = 't-service';
360 | bingDiv.innerHTML = `
361 | Bing Translate API
362 |
363 | 无需配置API密钥
364 |
365 | `;
366 | container.appendChild(bingDiv);
367 | } else {
368 | // If no Bing settings exist, create a default entry
369 | const bingDiv = document.createElement('div');
370 | bingDiv.className = 't-sub-section';
371 | bingDiv.innerHTML = `
372 |
376 |
377 |
378 | 无需API密钥,直接使用微软Bing翻译
379 |
380 |
381 | `;
382 | container.appendChild(bingDiv);
383 | }
384 | }
385 |
386 |
--------------------------------------------------------------------------------
/static/target/2403.20127v1_zh.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/target/2403.20127v1_zh.pdf
--------------------------------------------------------------------------------
/static/target/2501.05450v1_zh.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/target/2501.05450v1_zh.pdf
--------------------------------------------------------------------------------
/static/thumbnail/...txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/thumbnail/...txt
--------------------------------------------------------------------------------
/static/thumbnail/2403.20127v1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/thumbnail/2403.20127v1.png
--------------------------------------------------------------------------------
/static/thumbnail/2501.05450v1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/thumbnail/2501.05450v1.png
--------------------------------------------------------------------------------
/static/thumbnail/2g2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/thumbnail/2g2.png
--------------------------------------------------------------------------------
/static/thumbnail/32g2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/thumbnail/32g2.png
--------------------------------------------------------------------------------
/static/thumbnail/High-precision real-time autonomous driving targetdetection based on YOLOv8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/thumbnail/High-precision real-time autonomous driving targetdetection based on YOLOv8.png
--------------------------------------------------------------------------------
/static/thumbnail/g2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/thumbnail/g2.png
--------------------------------------------------------------------------------
/static/thumbnail/g55.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/thumbnail/g55.png
--------------------------------------------------------------------------------
/static/thumbnail/g6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/thumbnail/g6.png
--------------------------------------------------------------------------------
/static/thumbnail/gl1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/thumbnail/gl1.png
--------------------------------------------------------------------------------
/static/thumbnail/line.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/thumbnail/line.png
--------------------------------------------------------------------------------
/static/thumbnail/m2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/thumbnail/m2.png
--------------------------------------------------------------------------------
/static/thumbnail/zz1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/static/thumbnail/zz1.png
--------------------------------------------------------------------------------
/temp/fonts/zh_subset.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CBIhalsen/PolyglotPDF/f476650ba6563f574d8cbdcc0840bab27fcc7d35/temp/fonts/zh_subset.ttf
--------------------------------------------------------------------------------
/update_recent.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import datetime
4 | from typing import List, Dict, Any
5 | import glob
6 | from collections import OrderedDict
7 | import re
8 | import shutil
9 |
10 | def parse_merged_filename(filename: str) -> Dict[str, str]:
11 | """从合并PDF文件名解析出原始文件名、原始语言和目标语言"""
12 | # 格式为:原始文件名_原始语言_目标语言.pdf
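# For example, "2403.20127v1_auto_zh.pdf" (one of the files under static/merged_pdf) parses to
# {"original_name": "2403.20127v1.pdf", "original_language": "auto", "target_language": "zh"}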
13 | pattern = r"(.+)_(\w+)_(\w+)\.pdf$"
14 | match = re.match(pattern, filename)
15 |
16 | if match:
17 | original_name = match.group(1) + ".pdf"  # re-append the .pdf extension
18 | original_lang = match.group(2)
19 | target_lang = match.group(3)
20 | return {
21 | "original_name": original_name,
22 | "original_language": original_lang,
23 | "target_language": target_lang
24 | }
25 | else:
26 | # If the name does not match the pattern, return defaults and make sure the .pdf extension is kept
27 | name_without_ext = filename.rsplit(".", 1)[0]  # strip the extension
28 | return {
29 | "original_name": name_without_ext + ".pdf",
30 | "original_language": "auto",
31 | "target_language": "zh"
32 | }
33 |
34 | def get_file_info(file_path: str) -> Dict[str, Any]:
35 | """从文件路径获取文件信息"""
36 | filename = os.path.basename(file_path)
37 | creation_time = os.path.getctime(file_path)
38 | date_str = datetime.datetime.fromtimestamp(creation_time).strftime('%Y-%m-%d %H:%M:%S')
39 |
40 | # Parse the file name
41 | parsed_info = parse_merged_filename(filename)
42 |
43 | # Use an ordered dict so the keys are written in a fixed order
44 | ordered_info = OrderedDict()
45 | ordered_info["index"] = 0  # placeholder, updated below
46 | ordered_info["date"] = date_str
47 | ordered_info["name"] = parsed_info["original_name"]
48 | ordered_info["original_language"] = parsed_info["original_language"]
49 | ordered_info["target_language"] = parsed_info["target_language"]
50 | ordered_info["read"] = "0" # 默认为未读
51 | ordered_info["statue"] = "1" # 默认状态为1
52 |
53 | return ordered_info
54 |
55 | def update_config_count(count: int) -> bool:
56 | """
57 | 更新config.json中的count值为指定的数量
58 |
59 | Args:
60 | count: 要设置的count值
61 |
62 | Returns:
63 | bool: 操作是否成功
64 | """
65 | try:
66 | # Read the config.json file
67 | config_path = "config.json"
68 | if os.path.exists(config_path):
69 | with open(config_path, "r", encoding="utf-8") as f:
70 | config = json.load(f)
71 |
72 | # Update the count value
73 | config["count"] = count
74 |
75 | # Write the file back
76 | with open(config_path, "w", encoding="utf-8") as f:
77 | json.dump(config, f, ensure_ascii=False, indent=2)
78 |
79 | print(f"已更新config.json的count值为: {count}")
80 | return True
81 | else:
82 | print(f"错误: 找不到config.json文件")
83 | return False
84 | except Exception as e:
85 | print(f"更新config.json的count值时发生错误: {str(e)}")
86 | return False
87 |
88 | def validate_json_file(file_path: str) -> bool:
89 | """
90 | 验证JSON文件格式是否正确
91 |
92 | Args:
93 | file_path: JSON文件路径
94 |
95 | Returns:
96 | bool: 文件格式是否有效
97 | """
98 | try:
99 | if os.path.exists(file_path):
100 | with open(file_path, "r", encoding="utf-8") as f:
101 | json.load(f)
102 | return True
103 | return False
104 | except Exception as e:
105 | print(f"JSON文件格式无效: {str(e)}")
106 | return False
107 |
108 | def update_recent_json():
109 | """更新recent.json文件,先清空现有配置,然后从索引0开始重新生成"""
110 | # Read files from the merged_pdf directory
111 | merged_path = os.path.join("static", "merged_pdf")
112 |
113 | # Create a backup
114 | if os.path.exists("recent.json"):
115 | try:
116 | shutil.copy2("recent.json", "recent.json.bak")
117 | print(f"已创建备份文件: recent.json.bak")
118 | except Exception as e:
119 | print(f"创建备份文件失败: {str(e)}")
120 |
121 | # Scan the merged_pdf directory for files
122 | if not os.path.exists(merged_path):
123 | print(f"警告: 目录不存在 {merged_path}")
124 | try:
125 | os.makedirs(merged_path, exist_ok=True)
126 | except Exception as e:
127 | print(f"创建目录失败: {str(e)}")
128 |
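# Matches files such as static/merged_pdf/2403.20127v1_auto_zh.pdf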
129 | merged_files = glob.glob(os.path.join(merged_path, "*.pdf"))
130 | new_entries = []
131 |
132 | for file_path in merged_files:
133 | file_info = get_file_info(file_path)
134 | new_entries.append(file_info)
135 |
136 | # Assign indices starting from 0
137 | for i, entry in enumerate(new_entries):
138 | entry["index"] = i
139 |
140 | # Validate the data format before saving
141 | try:
142 | # Use json.dumps to make sure the entries serialize cleanly
143 | json_str = json.dumps(new_entries, ensure_ascii=False, indent=2)
144 |
145 | # Write to the file
146 | with open("recent.json", "w", encoding="utf-8") as f:
147 | f.write(json_str)
148 |
149 | # Validate the written file
150 | if not validate_json_file("recent.json"):
151 | raise Exception("写入的JSON文件验证失败")
152 |
153 | # Update the count in config.json to the number of new entries
154 | update_config_count(len(new_entries))
155 |
156 | print(f"已重置并更新recent.json,共 {len(new_entries)} 条记录")
157 | except Exception as e:
158 | print(f"更新recent.json文件失败: {str(e)}")
159 | # Try to restore from the backup
160 | if os.path.exists("recent.json.bak"):
161 | try:
162 | shutil.copy2("recent.json.bak", "recent.json")
163 | print("已从备份恢复recent.json文件")
164 | except Exception as e2:
165 | print(f"从备份恢复失败: {str(e2)}")
166 |
167 | if __name__ == "__main__":
168 | update_recent_json()
169 |
--------------------------------------------------------------------------------