├── .github
│   └── workflows
│       └── docker-build.yml
├── .gitignore
├── Dockerfile
├── LICENSE
├── NOTICE
├── README.md
├── assets
│   ├── magic-pdf-template.json
│   ├── sponsor.JPG
│   ├── streamlint_ui.png
│   └── zsxq.JPG
├── client
│   └── streamlit_client.py
├── core
│   ├── __init__.py
│   ├── base.py
│   ├── converters
│   │   ├── __init__.py
│   │   ├── bingsearch.py
│   │   ├── custommarkdownify.py
│   │   ├── docx.py
│   │   ├── html.py
│   │   ├── image.py
│   │   ├── ipynb.py
│   │   ├── media.py
│   │   ├── mineru
│   │   │   ├── __init__.py
│   │   │   ├── pdf_processor.py
│   │   │   └── title_corrector.py
│   │   ├── mp3.py
│   │   ├── outlook.py
│   │   ├── pdf.py
│   │   ├── plaintext.py
│   │   ├── pptx.py
│   │   ├── rss.py
│   │   ├── wav.py
│   │   ├── wikipedia.py
│   │   ├── xls.py
│   │   ├── xlsx.py
│   │   ├── youtube.py
│   │   └── zip.py
│   ├── markitdown.py
│   └── model_manager.py
├── main.py
├── repository
│   └── db.py
└── requirements.txt
/.github/workflows/docker-build.yml:
--------------------------------------------------------------------------------
1 | name: Docker Image CI
2 |
3 | on:
4 |   push:
5 |     tags:
6 |       - 'v[0-9]+.[0-9]+.[0-9]+'
7 | 
8 | jobs:
9 |   build:
10 | 
11 |     runs-on: ubuntu-latest
12 | 
13 |     env:
14 |       IMAGE_NAME: ${{ github.event.repository.name }}
15 | 
16 |     steps:
17 |       - uses: actions/checkout@v4
18 | 
19 |       - name: Remove 'v' prefix from tag
20 |         id: tag_name
21 |         run: |
22 |           TAG_NAME=${GITHUB_REF#refs/tags/}
23 |           TAG_NAME=${TAG_NAME#v}
24 |           echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV
25 | 
26 |       - name: Log in to DockerHub
27 |         run: echo "${{ secrets.DOCKERHUB_TOKEN }}" | docker login -u "${{ secrets.DOCKERHUB_USERNAME }}" --password-stdin
28 | 
29 |       - name: Build the Docker image
30 |         run: docker build . --file Dockerfile --tag ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ env.TAG_NAME }}
31 | 
32 |       - name: Push the Docker image
33 |         run: docker push ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ env.TAG_NAME }}
34 |
--------------------------------------------------------------------------------
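
The two shell parameter expansions in the workflow derive the Docker tag from the Git tag: `${GITHUB_REF#refs/tags/}` strips the ref prefix and `${TAG_NAME#v}` drops the leading `v`, so pushing tag `v1.2.3` publishes image `:1.2.3`. A quick Python equivalent of that normalization, for illustration only:

```python
def docker_tag_from_ref(github_ref: str) -> str:
    """Mirror the workflow's shell logic: refs/tags/v1.2.3 -> 1.2.3"""
    tag = github_ref.removeprefix("refs/tags/")  # ${GITHUB_REF#refs/tags/}
    return tag.removeprefix("v")                 # ${TAG_NAME#v}

assert docker_tag_from_ref("refs/tags/v1.2.3") == "1.2.3"
```
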
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # UV
98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | #uv.lock
102 |
103 | # poetry
104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105 | # This is especially recommended for binary packages to ensure reproducibility, and is more
106 | # commonly ignored for libraries.
107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108 | #poetry.lock
109 |
110 | # pdm
111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112 | #pdm.lock
113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114 | # in version control.
115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116 | .pdm.toml
117 | .pdm-python
118 | .pdm-build/
119 |
120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121 | __pypackages__/
122 |
123 | # Celery stuff
124 | celerybeat-schedule
125 | celerybeat.pid
126 |
127 | # SageMath parsed files
128 | *.sage.py
129 |
130 | # Environments
131 | .env
132 | .venv
133 | env/
134 | venv/
135 | ENV/
136 | env.bak/
137 | venv.bak/
138 |
139 | # Spyder project settings
140 | .spyderproject
141 | .spyproject
142 |
143 | # Rope project settings
144 | .ropeproject
145 |
146 | # mkdocs documentation
147 | /site
148 |
149 | # mypy
150 | .mypy_cache/
151 | .dmypy.json
152 | dmypy.json
153 |
154 | # Pyre type checker
155 | .pyre/
156 |
157 | # pytype static type analyzer
158 | .pytype/
159 |
160 | # Cython debug symbols
161 | cython_debug/
162 |
163 | # PyCharm
164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166 | # and can be added to the global gitignore or merged into this file. For a more nuclear
167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168 | #.idea/
169 |
170 | # Ruff stuff:
171 | .ruff_cache/
172 |
173 | # PyPI configuration file
174 | .pypirc
175 |
176 | # custom
177 | *.db
178 | output_files
179 | .DS_Store
180 | .idea
181 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.10-slim
2 |
3 | ENV PYTHONDONTWRITEBYTECODE=1
4 |
5 | ENV PYTHONUNBUFFERED=1
6 |
7 | ENV HF_ENDPOINT="https://hf-mirror.com"
8 |
9 | # Set the working directory
10 | WORKDIR /app
11 |
12 | # Copy requirements.txt into the container
13 | COPY requirements.txt .
14 |
15 | # Install dependencies
16 | RUN pip install --upgrade -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple pip && \
17 | pip install --no-cache-dir -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple -r requirements.txt
18 |
19 | # Copy the application code into the container
20 | COPY . .
21 |
22 | # Expose the application service port
23 | EXPOSE 20926
24 |
25 | # Define the startup command
26 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "20926"]
27 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | This project is licensed under the GNU Affero General Public License v3.0 (AGPL-3.0).
2 | Portions of this software are derived from the following projects; see NOTICE for details.
3 | - [MarkitDown](https://github.com/microsoft/markitdown) under the MIT License.
4 | - [MinerU](https://github.com/opendatalab/MinerU) under the AGPL-3.0 License.
5 |
6 |
7 |
8 | GNU AFFERO GENERAL PUBLIC LICENSE
9 | Version 3, 19 November 2007
10 |
11 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
12 | Everyone is permitted to copy and distribute verbatim copies
13 | of this license document, but changing it is not allowed.
14 |
15 | Preamble
16 |
17 | The GNU Affero General Public License is a free, copyleft license for
18 | software and other kinds of works, specifically designed to ensure
19 | cooperation with the community in the case of network server software.
20 |
21 | The licenses for most software and other practical works are designed
22 | to take away your freedom to share and change the works. By contrast,
23 | our General Public Licenses are intended to guarantee your freedom to
24 | share and change all versions of a program--to make sure it remains free
25 | software for all its users.
26 |
27 | When we speak of free software, we are referring to freedom, not
28 | price. Our General Public Licenses are designed to make sure that you
29 | have the freedom to distribute copies of free software (and charge for
30 | them if you wish), that you receive source code or can get it if you
31 | want it, that you can change the software or use pieces of it in new
32 | free programs, and that you know you can do these things.
33 |
34 | Developers that use our General Public Licenses protect your rights
35 | with two steps: (1) assert copyright on the software, and (2) offer
36 | you this License which gives you legal permission to copy, distribute
37 | and/or modify the software.
38 |
39 | A secondary benefit of defending all users' freedom is that
40 | improvements made in alternate versions of the program, if they
41 | receive widespread use, become available for other developers to
42 | incorporate. Many developers of free software are heartened and
43 | encouraged by the resulting cooperation. However, in the case of
44 | software used on network servers, this result may fail to come about.
45 | The GNU General Public License permits making a modified version and
46 | letting the public access it on a server without ever releasing its
47 | source code to the public.
48 |
49 | The GNU Affero General Public License is designed specifically to
50 | ensure that, in such cases, the modified source code becomes available
51 | to the community. It requires the operator of a network server to
52 | provide the source code of the modified version running there to the
53 | users of that server. Therefore, public use of a modified version, on
54 | a publicly accessible server, gives the public access to the source
55 | code of the modified version.
56 |
57 | An older license, called the Affero General Public License and
58 | published by Affero, was designed to accomplish similar goals. This is
59 | a different license, not a version of the Affero GPL, but Affero has
60 | released a new version of the Affero GPL which permits relicensing under
61 | this license.
62 |
63 | The precise terms and conditions for copying, distribution and
64 | modification follow.
65 |
66 | TERMS AND CONDITIONS
67 |
68 | 0. Definitions.
69 |
70 | "This License" refers to version 3 of the GNU Affero General Public License.
71 |
72 | "Copyright" also means copyright-like laws that apply to other kinds of
73 | works, such as semiconductor masks.
74 |
75 | "The Program" refers to any copyrightable work licensed under this
76 | License. Each licensee is addressed as "you". "Licensees" and
77 | "recipients" may be individuals or organizations.
78 |
79 | To "modify" a work means to copy from or adapt all or part of the work
80 | in a fashion requiring copyright permission, other than the making of an
81 | exact copy. The resulting work is called a "modified version" of the
82 | earlier work or a work "based on" the earlier work.
83 |
84 | A "covered work" means either the unmodified Program or a work based
85 | on the Program.
86 |
87 | To "propagate" a work means to do anything with it that, without
88 | permission, would make you directly or secondarily liable for
89 | infringement under applicable copyright law, except executing it on a
90 | computer or modifying a private copy. Propagation includes copying,
91 | distribution (with or without modification), making available to the
92 | public, and in some countries other activities as well.
93 |
94 | To "convey" a work means any kind of propagation that enables other
95 | parties to make or receive copies. Mere interaction with a user through
96 | a computer network, with no transfer of a copy, is not conveying.
97 |
98 | An interactive user interface displays "Appropriate Legal Notices"
99 | to the extent that it includes a convenient and prominently visible
100 | feature that (1) displays an appropriate copyright notice, and (2)
101 | tells the user that there is no warranty for the work (except to the
102 | extent that warranties are provided), that licensees may convey the
103 | work under this License, and how to view a copy of this License. If
104 | the interface presents a list of user commands or options, such as a
105 | menu, a prominent item in the list meets this criterion.
106 |
107 | 1. Source Code.
108 |
109 | The "source code" for a work means the preferred form of the work
110 | for making modifications to it. "Object code" means any non-source
111 | form of a work.
112 |
113 | A "Standard Interface" means an interface that either is an official
114 | standard defined by a recognized standards body, or, in the case of
115 | interfaces specified for a particular programming language, one that
116 | is widely used among developers working in that language.
117 |
118 | The "System Libraries" of an executable work include anything, other
119 | than the work as a whole, that (a) is included in the normal form of
120 | packaging a Major Component, but which is not part of that Major
121 | Component, and (b) serves only to enable use of the work with that
122 | Major Component, or to implement a Standard Interface for which an
123 | implementation is available to the public in source code form. A
124 | "Major Component", in this context, means a major essential component
125 | (kernel, window system, and so on) of the specific operating system
126 | (if any) on which the executable work runs, or a compiler used to
127 | produce the work, or an object code interpreter used to run it.
128 |
129 | The "Corresponding Source" for a work in object code form means all
130 | the source code needed to generate, install, and (for an executable
131 | work) run the object code and to modify the work, including scripts to
132 | control those activities. However, it does not include the work's
133 | System Libraries, or general-purpose tools or generally available free
134 | programs which are used unmodified in performing those activities but
135 | which are not part of the work. For example, Corresponding Source
136 | includes interface definition files associated with source files for
137 | the work, and the source code for shared libraries and dynamically
138 | linked subprograms that the work is specifically designed to require,
139 | such as by intimate data communication or control flow between those
140 | subprograms and other parts of the work.
141 |
142 | The Corresponding Source need not include anything that users
143 | can regenerate automatically from other parts of the Corresponding
144 | Source.
145 |
146 | The Corresponding Source for a work in source code form is that
147 | same work.
148 |
149 | 2. Basic Permissions.
150 |
151 | All rights granted under this License are granted for the term of
152 | copyright on the Program, and are irrevocable provided the stated
153 | conditions are met. This License explicitly affirms your unlimited
154 | permission to run the unmodified Program. The output from running a
155 | covered work is covered by this License only if the output, given its
156 | content, constitutes a covered work. This License acknowledges your
157 | rights of fair use or other equivalent, as provided by copyright law.
158 |
159 | You may make, run and propagate covered works that you do not
160 | convey, without conditions so long as your license otherwise remains
161 | in force. You may convey covered works to others for the sole purpose
162 | of having them make modifications exclusively for you, or provide you
163 | with facilities for running those works, provided that you comply with
164 | the terms of this License in conveying all material for which you do
165 | not control copyright. Those thus making or running the covered works
166 | for you must do so exclusively on your behalf, under your direction
167 | and control, on terms that prohibit them from making any copies of
168 | your copyrighted material outside their relationship with you.
169 |
170 | Conveying under any other circumstances is permitted solely under
171 | the conditions stated below. Sublicensing is not allowed; section 10
172 | makes it unnecessary.
173 |
174 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
175 |
176 | No covered work shall be deemed part of an effective technological
177 | measure under any applicable law fulfilling obligations under article
178 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
179 | similar laws prohibiting or restricting circumvention of such
180 | measures.
181 |
182 | When you convey a covered work, you waive any legal power to forbid
183 | circumvention of technological measures to the extent such circumvention
184 | is effected by exercising rights under this License with respect to
185 | the covered work, and you disclaim any intention to limit operation or
186 | modification of the work as a means of enforcing, against the work's
187 | users, your or third parties' legal rights to forbid circumvention of
188 | technological measures.
189 |
190 | 4. Conveying Verbatim Copies.
191 |
192 | You may convey verbatim copies of the Program's source code as you
193 | receive it, in any medium, provided that you conspicuously and
194 | appropriately publish on each copy an appropriate copyright notice;
195 | keep intact all notices stating that this License and any
196 | non-permissive terms added in accord with section 7 apply to the code;
197 | keep intact all notices of the absence of any warranty; and give all
198 | recipients a copy of this License along with the Program.
199 |
200 | You may charge any price or no price for each copy that you convey,
201 | and you may offer support or warranty protection for a fee.
202 |
203 | 5. Conveying Modified Source Versions.
204 |
205 | You may convey a work based on the Program, or the modifications to
206 | produce it from the Program, in the form of source code under the
207 | terms of section 4, provided that you also meet all of these conditions:
208 |
209 | a) The work must carry prominent notices stating that you modified
210 | it, and giving a relevant date.
211 |
212 | b) The work must carry prominent notices stating that it is
213 | released under this License and any conditions added under section
214 | 7. This requirement modifies the requirement in section 4 to
215 | "keep intact all notices".
216 |
217 | c) You must license the entire work, as a whole, under this
218 | License to anyone who comes into possession of a copy. This
219 | License will therefore apply, along with any applicable section 7
220 | additional terms, to the whole of the work, and all its parts,
221 | regardless of how they are packaged. This License gives no
222 | permission to license the work in any other way, but it does not
223 | invalidate such permission if you have separately received it.
224 |
225 | d) If the work has interactive user interfaces, each must display
226 | Appropriate Legal Notices; however, if the Program has interactive
227 | interfaces that do not display Appropriate Legal Notices, your
228 | work need not make them do so.
229 |
230 | A compilation of a covered work with other separate and independent
231 | works, which are not by their nature extensions of the covered work,
232 | and which are not combined with it such as to form a larger program,
233 | in or on a volume of a storage or distribution medium, is called an
234 | "aggregate" if the compilation and its resulting copyright are not
235 | used to limit the access or legal rights of the compilation's users
236 | beyond what the individual works permit. Inclusion of a covered work
237 | in an aggregate does not cause this License to apply to the other
238 | parts of the aggregate.
239 |
240 | 6. Conveying Non-Source Forms.
241 |
242 | You may convey a covered work in object code form under the terms
243 | of sections 4 and 5, provided that you also convey the
244 | machine-readable Corresponding Source under the terms of this License,
245 | in one of these ways:
246 |
247 | a) Convey the object code in, or embodied in, a physical product
248 | (including a physical distribution medium), accompanied by the
249 | Corresponding Source fixed on a durable physical medium
250 | customarily used for software interchange.
251 |
252 | b) Convey the object code in, or embodied in, a physical product
253 | (including a physical distribution medium), accompanied by a
254 | written offer, valid for at least three years and valid for as
255 | long as you offer spare parts or customer support for that product
256 | model, to give anyone who possesses the object code either (1) a
257 | copy of the Corresponding Source for all the software in the
258 | product that is covered by this License, on a durable physical
259 | medium customarily used for software interchange, for a price no
260 | more than your reasonable cost of physically performing this
261 | conveying of source, or (2) access to copy the
262 | Corresponding Source from a network server at no charge.
263 |
264 | c) Convey individual copies of the object code with a copy of the
265 | written offer to provide the Corresponding Source. This
266 | alternative is allowed only occasionally and noncommercially, and
267 | only if you received the object code with such an offer, in accord
268 | with subsection 6b.
269 |
270 | d) Convey the object code by offering access from a designated
271 | place (gratis or for a charge), and offer equivalent access to the
272 | Corresponding Source in the same way through the same place at no
273 | further charge. You need not require recipients to copy the
274 | Corresponding Source along with the object code. If the place to
275 | copy the object code is a network server, the Corresponding Source
276 | may be on a different server (operated by you or a third party)
277 | that supports equivalent copying facilities, provided you maintain
278 | clear directions next to the object code saying where to find the
279 | Corresponding Source. Regardless of what server hosts the
280 | Corresponding Source, you remain obligated to ensure that it is
281 | available for as long as needed to satisfy these requirements.
282 |
283 | e) Convey the object code using peer-to-peer transmission, provided
284 | you inform other peers where the object code and Corresponding
285 | Source of the work are being offered to the general public at no
286 | charge under subsection 6d.
287 |
288 | A separable portion of the object code, whose source code is excluded
289 | from the Corresponding Source as a System Library, need not be
290 | included in conveying the object code work.
291 |
292 | A "User Product" is either (1) a "consumer product", which means any
293 | tangible personal property which is normally used for personal, family,
294 | or household purposes, or (2) anything designed or sold for incorporation
295 | into a dwelling. In determining whether a product is a consumer product,
296 | doubtful cases shall be resolved in favor of coverage. For a particular
297 | product received by a particular user, "normally used" refers to a
298 | typical or common use of that class of product, regardless of the status
299 | of the particular user or of the way in which the particular user
300 | actually uses, or expects or is expected to use, the product. A product
301 | is a consumer product regardless of whether the product has substantial
302 | commercial, industrial or non-consumer uses, unless such uses represent
303 | the only significant mode of use of the product.
304 |
305 | "Installation Information" for a User Product means any methods,
306 | procedures, authorization keys, or other information required to install
307 | and execute modified versions of a covered work in that User Product from
308 | a modified version of its Corresponding Source. The information must
309 | suffice to ensure that the continued functioning of the modified object
310 | code is in no case prevented or interfered with solely because
311 | modification has been made.
312 |
313 | If you convey an object code work under this section in, or with, or
314 | specifically for use in, a User Product, and the conveying occurs as
315 | part of a transaction in which the right of possession and use of the
316 | User Product is transferred to the recipient in perpetuity or for a
317 | fixed term (regardless of how the transaction is characterized), the
318 | Corresponding Source conveyed under this section must be accompanied
319 | by the Installation Information. But this requirement does not apply
320 | if neither you nor any third party retains the ability to install
321 | modified object code on the User Product (for example, the work has
322 | been installed in ROM).
323 |
324 | The requirement to provide Installation Information does not include a
325 | requirement to continue to provide support service, warranty, or updates
326 | for a work that has been modified or installed by the recipient, or for
327 | the User Product in which it has been modified or installed. Access to a
328 | network may be denied when the modification itself materially and
329 | adversely affects the operation of the network or violates the rules and
330 | protocols for communication across the network.
331 |
332 | Corresponding Source conveyed, and Installation Information provided,
333 | in accord with this section must be in a format that is publicly
334 | documented (and with an implementation available to the public in
335 | source code form), and must require no special password or key for
336 | unpacking, reading or copying.
337 |
338 | 7. Additional Terms.
339 |
340 | "Additional permissions" are terms that supplement the terms of this
341 | License by making exceptions from one or more of its conditions.
342 | Additional permissions that are applicable to the entire Program shall
343 | be treated as though they were included in this License, to the extent
344 | that they are valid under applicable law. If additional permissions
345 | apply only to part of the Program, that part may be used separately
346 | under those permissions, but the entire Program remains governed by
347 | this License without regard to the additional permissions.
348 |
349 | When you convey a copy of a covered work, you may at your option
350 | remove any additional permissions from that copy, or from any part of
351 | it. (Additional permissions may be written to require their own
352 | removal in certain cases when you modify the work.) You may place
353 | additional permissions on material, added by you to a covered work,
354 | for which you have or can give appropriate copyright permission.
355 |
356 | Notwithstanding any other provision of this License, for material you
357 | add to a covered work, you may (if authorized by the copyright holders of
358 | that material) supplement the terms of this License with terms:
359 |
360 | a) Disclaiming warranty or limiting liability differently from the
361 | terms of sections 15 and 16 of this License; or
362 |
363 | b) Requiring preservation of specified reasonable legal notices or
364 | author attributions in that material or in the Appropriate Legal
365 | Notices displayed by works containing it; or
366 |
367 | c) Prohibiting misrepresentation of the origin of that material, or
368 | requiring that modified versions of such material be marked in
369 | reasonable ways as different from the original version; or
370 |
371 | d) Limiting the use for publicity purposes of names of licensors or
372 | authors of the material; or
373 |
374 | e) Declining to grant rights under trademark law for use of some
375 | trade names, trademarks, or service marks; or
376 |
377 | f) Requiring indemnification of licensors and authors of that
378 | material by anyone who conveys the material (or modified versions of
379 | it) with contractual assumptions of liability to the recipient, for
380 | any liability that these contractual assumptions directly impose on
381 | those licensors and authors.
382 |
383 | All other non-permissive additional terms are considered "further
384 | restrictions" within the meaning of section 10. If the Program as you
385 | received it, or any part of it, contains a notice stating that it is
386 | governed by this License along with a term that is a further
387 | restriction, you may remove that term. If a license document contains
388 | a further restriction but permits relicensing or conveying under this
389 | License, you may add to a covered work material governed by the terms
390 | of that license document, provided that the further restriction does
391 | not survive such relicensing or conveying.
392 |
393 | If you add terms to a covered work in accord with this section, you
394 | must place, in the relevant source files, a statement of the
395 | additional terms that apply to those files, or a notice indicating
396 | where to find the applicable terms.
397 |
398 | Additional terms, permissive or non-permissive, may be stated in the
399 | form of a separately written license, or stated as exceptions;
400 | the above requirements apply either way.
401 |
402 | 8. Termination.
403 |
404 | You may not propagate or modify a covered work except as expressly
405 | provided under this License. Any attempt otherwise to propagate or
406 | modify it is void, and will automatically terminate your rights under
407 | this License (including any patent licenses granted under the third
408 | paragraph of section 11).
409 |
410 | However, if you cease all violation of this License, then your
411 | license from a particular copyright holder is reinstated (a)
412 | provisionally, unless and until the copyright holder explicitly and
413 | finally terminates your license, and (b) permanently, if the copyright
414 | holder fails to notify you of the violation by some reasonable means
415 | prior to 60 days after the cessation.
416 |
417 | Moreover, your license from a particular copyright holder is
418 | reinstated permanently if the copyright holder notifies you of the
419 | violation by some reasonable means, this is the first time you have
420 | received notice of violation of this License (for any work) from that
421 | copyright holder, and you cure the violation prior to 30 days after
422 | your receipt of the notice.
423 |
424 | Termination of your rights under this section does not terminate the
425 | licenses of parties who have received copies or rights from you under
426 | this License. If your rights have been terminated and not permanently
427 | reinstated, you do not qualify to receive new licenses for the same
428 | material under section 10.
429 |
430 | 9. Acceptance Not Required for Having Copies.
431 |
432 | You are not required to accept this License in order to receive or
433 | run a copy of the Program. Ancillary propagation of a covered work
434 | occurring solely as a consequence of using peer-to-peer transmission
435 | to receive a copy likewise does not require acceptance. However,
436 | nothing other than this License grants you permission to propagate or
437 | modify any covered work. These actions infringe copyright if you do
438 | not accept this License. Therefore, by modifying or propagating a
439 | covered work, you indicate your acceptance of this License to do so.
440 |
441 | 10. Automatic Licensing of Downstream Recipients.
442 |
443 | Each time you convey a covered work, the recipient automatically
444 | receives a license from the original licensors, to run, modify and
445 | propagate that work, subject to this License. You are not responsible
446 | for enforcing compliance by third parties with this License.
447 |
448 | An "entity transaction" is a transaction transferring control of an
449 | organization, or substantially all assets of one, or subdividing an
450 | organization, or merging organizations. If propagation of a covered
451 | work results from an entity transaction, each party to that
452 | transaction who receives a copy of the work also receives whatever
453 | licenses to the work the party's predecessor in interest had or could
454 | give under the previous paragraph, plus a right to possession of the
455 | Corresponding Source of the work from the predecessor in interest, if
456 | the predecessor has it or can get it with reasonable efforts.
457 |
458 | You may not impose any further restrictions on the exercise of the
459 | rights granted or affirmed under this License. For example, you may
460 | not impose a license fee, royalty, or other charge for exercise of
461 | rights granted under this License, and you may not initiate litigation
462 | (including a cross-claim or counterclaim in a lawsuit) alleging that
463 | any patent claim is infringed by making, using, selling, offering for
464 | sale, or importing the Program or any portion of it.
465 |
466 | 11. Patents.
467 |
468 | A "contributor" is a copyright holder who authorizes use under this
469 | License of the Program or a work on which the Program is based. The
470 | work thus licensed is called the contributor's "contributor version".
471 |
472 | A contributor's "essential patent claims" are all patent claims
473 | owned or controlled by the contributor, whether already acquired or
474 | hereafter acquired, that would be infringed by some manner, permitted
475 | by this License, of making, using, or selling its contributor version,
476 | but do not include claims that would be infringed only as a
477 | consequence of further modification of the contributor version. For
478 | purposes of this definition, "control" includes the right to grant
479 | patent sublicenses in a manner consistent with the requirements of
480 | this License.
481 |
482 | Each contributor grants you a non-exclusive, worldwide, royalty-free
483 | patent license under the contributor's essential patent claims, to
484 | make, use, sell, offer for sale, import and otherwise run, modify and
485 | propagate the contents of its contributor version.
486 |
487 | In the following three paragraphs, a "patent license" is any express
488 | agreement or commitment, however denominated, not to enforce a patent
489 | (such as an express permission to practice a patent or covenant not to
490 | sue for patent infringement). To "grant" such a patent license to a
491 | party means to make such an agreement or commitment not to enforce a
492 | patent against the party.
493 |
494 | If you convey a covered work, knowingly relying on a patent license,
495 | and the Corresponding Source of the work is not available for anyone
496 | to copy, free of charge and under the terms of this License, through a
497 | publicly available network server or other readily accessible means,
498 | then you must either (1) cause the Corresponding Source to be so
499 | available, or (2) arrange to deprive yourself of the benefit of the
500 | patent license for this particular work, or (3) arrange, in a manner
501 | consistent with the requirements of this License, to extend the patent
502 | license to downstream recipients. "Knowingly relying" means you have
503 | actual knowledge that, but for the patent license, your conveying the
504 | covered work in a country, or your recipient's use of the covered work
505 | in a country, would infringe one or more identifiable patents in that
506 | country that you have reason to believe are valid.
507 |
508 | If, pursuant to or in connection with a single transaction or
509 | arrangement, you convey, or propagate by procuring conveyance of, a
510 | covered work, and grant a patent license to some of the parties
511 | receiving the covered work authorizing them to use, propagate, modify
512 | or convey a specific copy of the covered work, then the patent license
513 | you grant is automatically extended to all recipients of the covered
514 | work and works based on it.
515 |
516 | A patent license is "discriminatory" if it does not include within
517 | the scope of its coverage, prohibits the exercise of, or is
518 | conditioned on the non-exercise of one or more of the rights that are
519 | specifically granted under this License. You may not convey a covered
520 | work if you are a party to an arrangement with a third party that is
521 | in the business of distributing software, under which you make payment
522 | to the third party based on the extent of your activity of conveying
523 | the work, and under which the third party grants, to any of the
524 | parties who would receive the covered work from you, a discriminatory
525 | patent license (a) in connection with copies of the covered work
526 | conveyed by you (or copies made from those copies), or (b) primarily
527 | for and in connection with specific products or compilations that
528 | contain the covered work, unless you entered into that arrangement,
529 | or that patent license was granted, prior to 28 March 2007.
530 |
531 | Nothing in this License shall be construed as excluding or limiting
532 | any implied license or other defenses to infringement that may
533 | otherwise be available to you under applicable patent law.
534 |
535 | 12. No Surrender of Others' Freedom.
536 |
537 | If conditions are imposed on you (whether by court order, agreement or
538 | otherwise) that contradict the conditions of this License, they do not
539 | excuse you from the conditions of this License. If you cannot convey a
540 | covered work so as to satisfy simultaneously your obligations under this
541 | License and any other pertinent obligations, then as a consequence you may
542 | not convey it at all. For example, if you agree to terms that obligate you
543 | to collect a royalty for further conveying from those to whom you convey
544 | the Program, the only way you could satisfy both those terms and this
545 | License would be to refrain entirely from conveying the Program.
546 |
547 | 13. Remote Network Interaction; Use with the GNU General Public License.
548 |
549 | Notwithstanding any other provision of this License, if you modify the
550 | Program, your modified version must prominently offer all users
551 | interacting with it remotely through a computer network (if your version
552 | supports such interaction) an opportunity to receive the Corresponding
553 | Source of your version by providing access to the Corresponding Source
554 | from a network server at no charge, through some standard or customary
555 | means of facilitating copying of software. This Corresponding Source
556 | shall include the Corresponding Source for any work covered by version 3
557 | of the GNU General Public License that is incorporated pursuant to the
558 | following paragraph.
559 |
560 | Notwithstanding any other provision of this License, you have
561 | permission to link or combine any covered work with a work licensed
562 | under version 3 of the GNU General Public License into a single
563 | combined work, and to convey the resulting work. The terms of this
564 | License will continue to apply to the part which is the covered work,
565 | but the work with which it is combined will remain governed by version
566 | 3 of the GNU General Public License.
567 |
568 | 14. Revised Versions of this License.
569 |
570 | The Free Software Foundation may publish revised and/or new versions of
571 | the GNU Affero General Public License from time to time. Such new versions
572 | will be similar in spirit to the present version, but may differ in detail to
573 | address new problems or concerns.
574 |
575 | Each version is given a distinguishing version number. If the
576 | Program specifies that a certain numbered version of the GNU Affero General
577 | Public License "or any later version" applies to it, you have the
578 | option of following the terms and conditions either of that numbered
579 | version or of any later version published by the Free Software
580 | Foundation. If the Program does not specify a version number of the
581 | GNU Affero General Public License, you may choose any version ever published
582 | by the Free Software Foundation.
583 |
584 | If the Program specifies that a proxy can decide which future
585 | versions of the GNU Affero General Public License can be used, that proxy's
586 | public statement of acceptance of a version permanently authorizes you
587 | to choose that version for the Program.
588 |
589 | Later license versions may give you additional or different
590 | permissions. However, no additional obligations are imposed on any
591 | author or copyright holder as a result of your choosing to follow a
592 | later version.
593 |
594 | 15. Disclaimer of Warranty.
595 |
596 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
597 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
598 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
599 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
600 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
601 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
602 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
603 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
604 |
605 | 16. Limitation of Liability.
606 |
607 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
608 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
609 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
610 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
611 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
612 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
613 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
614 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
615 | SUCH DAMAGES.
616 |
617 | 17. Interpretation of Sections 15 and 16.
618 |
619 | If the disclaimer of warranty and limitation of liability provided
620 | above cannot be given local legal effect according to their terms,
621 | reviewing courts shall apply local law that most closely approximates
622 | an absolute waiver of all civil liability in connection with the
623 | Program, unless a warranty or assumption of liability accompanies a
624 | copy of the Program in return for a fee.
625 |
626 | END OF TERMS AND CONDITIONS
627 |
628 | How to Apply These Terms to Your New Programs
629 |
630 | If you develop a new program, and you want it to be of the greatest
631 | possible use to the public, the best way to achieve this is to make it
632 | free software which everyone can redistribute and change under these terms.
633 |
634 | To do so, attach the following notices to the program. It is safest
635 | to attach them to the start of each source file to most effectively
636 | state the exclusion of warranty; and each file should have at least
637 | the "copyright" line and a pointer to where the full notice is found.
638 |
639 |     <one line to give the program's name and a brief idea of what it does.>
640 |     Copyright (C) <year>  <name of author>
641 |
642 | This program is free software: you can redistribute it and/or modify
643 | it under the terms of the GNU Affero General Public License as published
644 | by the Free Software Foundation, either version 3 of the License, or
645 | (at your option) any later version.
646 |
647 | This program is distributed in the hope that it will be useful,
648 | but WITHOUT ANY WARRANTY; without even the implied warranty of
649 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
650 | GNU Affero General Public License for more details.
651 |
652 | You should have received a copy of the GNU Affero General Public License
653 | along with this program. If not, see <https://www.gnu.org/licenses/>.
654 |
655 | Also add information on how to contact you by electronic and paper mail.
656 |
657 | If your software can interact with users remotely through a computer
658 | network, you should also make sure that it provides a way for users to
659 | get its source. For example, if your program is a web application, its
660 | interface could display a "Source" link that leads users to an archive
661 | of the code. There are many ways you could offer source, and different
662 | solutions will be better for different programs; see section 13 for the
663 | specific requirements.
664 |
665 | You should also get your employer (if you work as a programmer) or school,
666 | if any, to sign a "copyright disclaimer" for the program, if necessary.
667 | For more information on this, and how to apply and follow the GNU AGPL, see
668 | <https://www.gnu.org/licenses/>.
669 |
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 |
2 | NOTICE
3 |
4 | This project includes code from the following MIT-licensed project(s):
5 |
6 | - Project Name: Markitdown
7 | - Repository: https://github.com/microsoft/markitdown
8 | - License: MIT
9 |
10 | MIT License
11 |
12 | Copyright (c) Microsoft Corporation.
13 |
14 | Permission is hereby granted, free of charge, to any person obtaining a copy
15 | of this software and associated documentation files (the "Software"), to deal
16 | in the Software without restriction, including without limitation the rights
17 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
18 | copies of the Software, and to permit persons to whom the Software is
19 | furnished to do so, subject to the following conditions:
20 |
21 | The above copyright notice and this permission notice shall be included in all
22 | copies or substantial portions of the Software.
23 |
24 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Markify
2 |
3 | ✨ **Effortlessly convert files to Markdown, helping RAG and LLMs understand content more intelligently!** ✨
4 | 
5 | 🚀 **Built on Markitdown and MinerU**, Markify supports conversion from many formats and delivers **high-quality PDF parsing**, making your documents easier to process and use!
6 | 
7 | 📡 **Available via API and a Streamlit client**, so you can convert efficiently anywhere and integrate with ease!
8 | 
9 | 📂 **Supported file formats**:
10 | - 📄 **Documents**: PDF, Word, PPT, Excel
11 | - 🖼 **Media**: images, audio
12 | - 🌐 **Web & data**: HTML, CSV, JSON, XML
13 | - 🗂 **Archives**: ZIP
14 | 
15 | ⚡ **Multiple PDF parsing modes for different needs**:
16 | - 🚀 **Simple mode** (based on pdfminer; fast and efficient)
17 | - 🏆 **Advanced mode** (deep parsing with MinerU; better quality)
18 | - ☁️ **Cloud mode** (in development, stay tuned!)
19 | 
20 | 📖 **Markdown-ify your files to help LLMs better understand and process your documents!** 💡
21 |
22 | ![Streamlit UI](assets/streamlint_ui.png)
23 | ```shell
24 | streamlit run ./client/streamlit_client.py
25 | ```
26 |
27 | ## API
28 | FastAPI serves its interactive API docs at http://127.0.0.1:20926/docs
29 | ### Upload a file to create a job
30 | Request
31 | ```shell
32 | curl -X 'POST' \
33 | 'http://127.0.0.1:20926/api/jobs' \
34 | -H 'accept: application/json' \
35 | -H 'Content-Type: multipart/form-data' \
36 | -F 'file=@CoA.pdf;type=application/pdf' \
37 | -F 'mode=advanced'
38 | ```
39 | Response
40 | ```json
41 | {
42 | "job_id": "29bbad6b-c167-41f0-8a29-99551c499263"
43 | }
44 | ```
45 | ### Query job status
46 | Request
47 | ```shell
48 | curl -X 'GET' \
49 | 'http://127.0.0.1:20926/api/jobs/29bbad6b-c167-41f0-8a29-99551c499263' \
50 | -H 'accept: application/json'
51 | ```
52 | Response
53 | ```json
54 | {
55 | "job_id": "29bbad6b-c167-41f0-8a29-99551c499263",
56 | "status": "completed",
57 | "filename": "CoA.pdf",
58 | "params": {
59 | "mode": "advanced"
60 | },
61 | "error": null
62 | }
63 | ```
64 | ### Download the Markdown file
65 | Request
66 | ```shell
67 | curl -X 'GET' \
68 | 'http://127.0.0.1:20926/api/jobs/29bbad6b-c167-41f0-8a29-99551c499263/result' \
69 | -H 'accept: application/json'
70 | ```
71 | Response
72 | The converted Markdown file
73 |
74 |
75 | ## Docker deployment
76 | ```shell
77 | docker pull wsjcuhk/markify:0.0.1
78 | docker run -d -p 20926:20926 wsjcuhk/markify:0.0.1
79 | ```
80 |
81 |
82 | ## TODO
83 | - Add a cloud parsing mode
84 | - Automatically package as a Docker image
85 |
86 | ## Sponsor me
87 | Open source is not easy. If you need professional guidance or would like to sponsor this project, you can join my Knowledge Planet (知识星球), where I provide professional technical guidance.
88 | ![知识星球](assets/zsxq.JPG)
89 |
90 |
91 | ## Acknowledgements
92 | This project builds on Microsoft's markitdown and OpenDataLab's MinerU.
93 | - [markitdown](https://github.com/microsoft/markitdown)
94 | - [mineru](https://github.com/opendatalab/MinerU)
95 |
--------------------------------------------------------------------------------
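
The curl calls in the README map directly onto a small polling client. A minimal sketch using `requests` against the documented endpoints (the file name and polling interval are illustrative):

```python
import time

import requests

BASE_URL = "http://127.0.0.1:20926"


def convert(path: str, mode: str = "advanced") -> bytes:
    # Create the job: POST /api/jobs returns {"job_id": ...}
    with open(path, "rb") as fh:
        resp = requests.post(f"{BASE_URL}/api/jobs",
                             files={"file": fh}, data={"mode": mode})
    resp.raise_for_status()
    job_id = resp.json()["job_id"]

    # Poll GET /api/jobs/{job_id} until the job finishes
    while True:
        job = requests.get(f"{BASE_URL}/api/jobs/{job_id}").json()
        if job["status"] == "completed":
            break
        if job["status"] == "failed":
            raise RuntimeError(job["error"])
        time.sleep(2)

    # Download the Markdown result
    result = requests.get(f"{BASE_URL}/api/jobs/{job_id}/result")
    result.raise_for_status()
    return result.content


markdown = convert("CoA.pdf")
```
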
/assets/magic-pdf-template.json:
--------------------------------------------------------------------------------
1 | {
2 | "bucket_info":{
3 | "bucket-name-1":["ak", "sk", "endpoint"],
4 | "bucket-name-2":["ak", "sk", "endpoint"]
5 | },
6 | "models-dir":"/tmp/models",
7 | "layoutreader-model-dir":"/tmp/layoutreader",
8 | "device-mode":"cpu",
9 | "layout-config": {
10 | "model": "doclayout_yolo"
11 | },
12 | "formula-config": {
13 | "mfd_model": "yolo_v8_mfd",
14 | "mfr_model": "unimernet_small",
15 | "enable": true
16 | },
17 | "table-config": {
18 | "model": "rapid_table",
19 | "sub_model": "slanet_plus",
20 | "enable": true,
21 | "max_time": 400
22 | },
23 | "llm-aided-config": {
24 | "formula_aided": {
25 | "api_key": "your_api_key",
26 | "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
27 | "model": "qwen2.5-7b-instruct",
28 | "enable": false
29 | },
30 | "text_aided": {
31 | "api_key": "your_api_key",
32 | "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
33 | "model": "qwen2.5-7b-instruct",
34 | "enable": false
35 | },
36 | "title_aided": {
37 | "api_key": "your_api_key",
38 | "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
39 | "model": "qwen2.5-32b-instruct",
40 | "enable": false
41 | }
42 | },
43 | "config_version": "1.1.1"
44 | }
--------------------------------------------------------------------------------
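
This template mirrors MinerU's magic-pdf configuration. A sketch of how one might materialize it at runtime, assuming (per MinerU's documentation) that the config is read from `magic-pdf.json` in the user's home directory and that the model paths must point at the downloaded weights:

```python
import json
from pathlib import Path

TEMPLATE = Path("assets/magic-pdf-template.json")


def install_config(models_dir: str, layoutreader_dir: str, device: str = "cpu") -> Path:
    """Fill in the template and write it where magic-pdf is assumed to look for it."""
    config = json.loads(TEMPLATE.read_text(encoding="utf-8"))
    config["models-dir"] = models_dir
    config["layoutreader-model-dir"] = layoutreader_dir
    config["device-mode"] = device  # "cpu" or "cuda"

    target = Path.home() / "magic-pdf.json"
    target.write_text(json.dumps(config, indent=4), encoding="utf-8")
    return target
```
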
/assets/sponsor.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinMountain/markify/622070781f1cdb74506b3ec1ec7d1749c1ef458c/assets/sponsor.JPG
--------------------------------------------------------------------------------
/assets/streamlint_ui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinMountain/markify/622070781f1cdb74506b3ec1ec7d1749c1ef458c/assets/streamlint_ui.png
--------------------------------------------------------------------------------
/assets/zsxq.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinMountain/markify/622070781f1cdb74506b3ec1ec7d1749c1ef458c/assets/zsxq.JPG
--------------------------------------------------------------------------------
/client/streamlit_client.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import requests
3 | import time
4 | import os
5 |
6 | # ============ Configuration ============
7 | BASE_URL = "http://localhost:20926"
8 | 
9 | 
10 | # ============ Helper functions ============
11 | 
12 | def fetch_jobs(page=0, limit=10):
13 |     """
14 |     Fetch the latest job list from the backend /api/jobs endpoint.
15 |     (Make sure the backend implements the ?page=...&limit=... pagination parameters.)
16 |     """
17 |     url = f"{BASE_URL}/api/jobs?page={page}&limit={limit}"
18 |     try:
19 |         resp = requests.get(url)
20 |         if resp.status_code == 200:
21 |             return resp.json()  # The backend should return a list of jobs
22 |         else:
23 |             st.error(f"Failed to fetch job list: {resp.text}")
24 |             return []
25 |     except requests.RequestException as e:
26 |         st.error(f"Network error: {e}")
27 |         return []
28 |
29 |
30 | def upload_file(file, mode):
31 |     """
32 |     Upload a single file to the backend and create a job.
33 |     On success, immediately rerun the page to fetch the latest job list.
34 |     """
35 |     files = {"file": file}
36 |     data = {"mode": mode}
37 |     try:
38 |         response = requests.post(f"{BASE_URL}/api/jobs", files=files, data=data)
39 |         if response.status_code == 202:
40 |             st.success(f"File `{file.name}` uploaded and queued.")
41 |             st.experimental_rerun()  # Trigger a page rerun so fetch_jobs() runs again
42 |         else:
43 |             st.error(f"Failed to upload file `{file.name}`: {response.text}")
44 |     except requests.RequestException as e:
45 |         st.error(f"Network error: {e}")
46 | 
47 | 
48 | def upload_url(url, mode):
49 |     """
50 |     Submit a single URL to the backend and create a job.
51 |     On success, immediately rerun the page to fetch the latest job list.
52 |     """
53 |     data = {"url": url, "mode": mode}
54 |     try:
55 |         response = requests.post(f"{BASE_URL}/api/jobs/url", json=data)
56 |         if response.status_code == 202:
57 |             st.success(f"URL `{url}` submitted and queued.")
58 |             st.experimental_rerun()
59 |         else:
60 |             st.error(f"Failed to submit URL `{url}`: {response.text}")
61 |     except requests.RequestException as e:
62 |         st.error(f"Network error: {e}")
63 |
64 |
65 | def show_file_entry(job):
66 |     """
67 |     Render one job entry in the file list on the right.
68 |     Example of the job structure returned by the backend (JSON):
69 |     {
70 |         "job_id": "xxx",
71 |         "status": "completed",
72 |         "filename": "test.pdf",
73 |         "params": {"mode": "simple"},
74 |         "error": null,
75 |         "created_at": "2025-02-25T10:00:00"
76 |     }
77 |     Adjust this to match the fields your backend actually returns.
78 |     """
79 |     col1, col2, col3, col4 = st.columns([3, 2, 2, 1])
80 | 
81 |     with col1:
82 |         st.markdown(f"**{job['filename']}**")
83 | 
84 |     # Show created_at if the backend returned it
85 |     with col2:
86 |         created_time = job.get("created_at", "")
87 |         st.markdown(f"{created_time}")
88 | 
89 |     with col3:
90 |         status = job["status"]
91 |         if status == "completed":
92 |             status_icon = "✅"
93 |         elif status == "failed":
94 |             status_icon = "❌"
95 |         else:
96 |             status_icon = "⏳"
97 |         st.markdown(f"{status_icon} {status}")
98 | 
99 |     with col4:
100 |         # Offer a download once the job has completed
101 |         if status == "completed":
102 |             try:
103 |                 result_response = requests.get(f"{BASE_URL}/api/jobs/{job['job_id']}/result")
104 |                 if result_response.status_code == 200:
105 |                     st.download_button(
106 |                         label="Download",
107 |                         data=result_response.content,
108 |                         file_name=f"{job['filename']}.md",
109 |                         mime="text/markdown",
110 |                         key=f"download_{job['job_id']}"  # Unique key per job
111 |                     )
112 |                 else:
113 |                     st.error("Download unavailable")
114 |             except requests.RequestException as e:
115 |                 st.error(f"Download error: {e}")
116 |
117 |
118 | # ============ Main ============
119 | 
120 | def main():
121 |     st.set_page_config(page_title="Markify", layout="wide")
122 | 
123 |     # Page title and description
124 |     st.title("Markify - Document Processing")
125 |     st.markdown("Upload files or submit URLs on the left; watch progress and download results on the right.")
126 | 
127 |     # Layout: uploads on the left, job list on the right
128 |     left_col, right_col = st.columns([2, 3], gap="large")
129 | 
130 |     with left_col:
131 |         st.subheader("Upload settings")
132 |         mode = st.selectbox("Select a PDF processing mode", ["simple", "advanced", "cloud"])
133 | 
134 |         # Local file upload
135 |         uploaded_files = st.file_uploader(
136 |             "Choose files (any type)",
137 |             type=None,
138 |             accept_multiple_files=True
139 |         )
140 |         if uploaded_files and st.button("Upload files"):
141 |             for file in uploaded_files:
142 |                 upload_file(file, mode)
143 | 
144 |         # URL upload
145 |         st.subheader("URL upload")
146 |         file_urls = st.text_area("Enter file URLs (one per line)")
147 |         if file_urls and st.button("Submit URLs"):
148 |             for url in file_urls.strip().split("\n"):
149 |                 if url:
150 |                     upload_url(url.strip(), mode)
151 | 
152 |         # Where parsed results are stored (informational only)
153 |         st.markdown(f"**Parsed results are stored at**: `{os.path.expanduser('~')}/MinerU`")
154 | 
155 |     with right_col:
156 |         st.subheader("File list")
157 | 
158 |         # Manual refresh button
159 |         if st.button("Refresh list"):
160 |             st.experimental_rerun()
161 | 
162 |         # Fetch the job list from the backend
163 |         jobs = fetch_jobs(page=0, limit=10)
164 |         if not jobs:
165 |             st.info("No jobs yet. Upload something and check back.")
166 |         else:
167 |             for job in jobs:
168 |                 show_file_entry(job)
169 | 
170 | 
171 | if __name__ == "__main__":
172 |     main()
173 |
--------------------------------------------------------------------------------
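
fetch_jobs above assumes the backend exposes offset pagination via `?page=` and `?limit=`. main.py is not shown in this dump, but a matching FastAPI endpoint would look roughly like this sketch (the in-memory JOBS list is a stand-in for whatever repository/db.py actually stores):

```python
from fastapi import FastAPI

app = FastAPI()

# In-memory stand-in for repository/db.py, purely for illustration
JOBS: list[dict] = []


@app.get("/api/jobs")
def get_jobs(page: int = 0, limit: int = 10) -> list[dict]:
    """Return one page of jobs, matching the client's ?page=&limit= query."""
    start = page * limit
    return JOBS[start:start + limit]
```
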
/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinMountain/markify/622070781f1cdb74506b3ec1ec7d1749c1ef458c/core/__init__.py
--------------------------------------------------------------------------------
/core/base.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Union
2 |
3 |
4 | class DocumentConverterResult:
5 | """The result of converting a document to text."""
6 |
7 | def __init__(self, title: Union[str, None] = None, text_content: str = ""):
8 | self.title: Union[str, None] = title
9 | self.text_content: str = text_content
10 |
11 |
12 | class DocumentConverter:
13 | """Abstract superclass of all DocumentConverters."""
14 |
15 | def convert(
16 | self, local_path: str, **kwargs: Any
17 | ) -> Union[None, DocumentConverterResult]:
18 | raise NotImplementedError()
19 |
20 |
21 | class FileConversionException(Exception):  # Exception rather than BaseException, so generic handlers can catch it
22 |     pass
23 | 
24 | 
25 | class UnsupportedFormatException(Exception):
26 |     pass
27 |
--------------------------------------------------------------------------------
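
A minimal sketch of how a new converter plugs into this base API: subclass `DocumentConverter`, return `None` when the extension doesn't match, and return a `DocumentConverterResult` otherwise. The `.csv` handling here is illustrative, not part of the repo.

```python
from typing import Any, Union

from core.base import DocumentConverter, DocumentConverterResult


class CsvConverter(DocumentConverter):
    """Illustrative converter: renders a CSV file as a plain text block."""

    def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
        # Bail if not a CSV, mirroring the convention used by the converters in this package
        if kwargs.get("file_extension", "").lower() != ".csv":
            return None
        with open(local_path, "rt", encoding="utf-8") as fh:
            return DocumentConverterResult(title=None, text_content=fh.read())
```
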
/core/converters/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinMountain/markify/622070781f1cdb74506b3ec1ec7d1749c1ef458c/core/converters/__init__.py
--------------------------------------------------------------------------------
/core/converters/bingsearch.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import binascii
3 | import re
4 | from typing import Union
5 | from urllib.parse import parse_qs, urlparse
6 |
7 | from bs4 import BeautifulSoup
8 |
9 | from core.base import DocumentConverter, DocumentConverterResult
10 | from core.converters.custommarkdownify import _CustomMarkdownify
11 |
12 |
13 | class BingSerpConverter(DocumentConverter):
14 | """
15 | Handle Bing results pages (only the organic search results).
16 | NOTE: It is better to use the Bing API
17 | """
18 |
19 | def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
20 | # Bail if not a Bing SERP
21 | extension = kwargs.get("file_extension", "")
22 | if extension.lower() not in [".html", ".htm"]:
23 | return None
24 | url = kwargs.get("url", "")
25 | if not re.search(r"^https://www\.bing\.com/search\?q=", url):
26 | return None
27 |
28 | # Parse the query parameters
29 | parsed_params = parse_qs(urlparse(url).query)
30 | query = parsed_params.get("q", [""])[0]
31 |
32 | # Parse the file
33 | soup = None
34 | with open(local_path, "rt", encoding="utf-8") as fh:
35 | soup = BeautifulSoup(fh.read(), "html.parser")
36 |
37 | # Clean up some formatting
38 | for tptt in soup.find_all(class_="tptt"):
39 | if hasattr(tptt, "string") and tptt.string:
40 | tptt.string += " "
41 | for slug in soup.find_all(class_="algoSlug_icon"):
42 | slug.extract()
43 |
44 | # Parse the algorithmic results
45 | _markdownify = _CustomMarkdownify()
46 | results = list()
47 | for result in soup.find_all(class_="b_algo"):
48 | # Rewrite redirect urls
49 | for a in result.find_all("a", href=True):
50 | parsed_href = urlparse(a["href"])
51 | qs = parse_qs(parsed_href.query)
52 |
53 | # The destination is contained in the u parameter,
54 | # but appears to be base64 encoded, with some prefix
55 | if "u" in qs:
56 | u = (
57 | qs["u"][0][2:].strip() + "=="
58 | ) # Python 3 doesn't care about extra padding
59 |
60 | try:
61 |                             # RFC 4648 "base64url" variant, which uses "-" and "_"
62 | a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8")
63 | except UnicodeDecodeError:
64 | pass
65 | except binascii.Error:
66 | pass
67 |
68 | # Convert to markdown
69 | md_result = _markdownify.convert_soup(result).strip()
70 | lines = [line.strip() for line in re.split(r"\n+", md_result)]
71 | results.append("\n".join([line for line in lines if len(line) > 0]))
72 |
73 | webpage_text = (
74 | f"## A Bing search for '{query}' found the following results:\n\n"
75 | + "\n\n".join(results)
76 | )
77 |
78 | return DocumentConverterResult(
79 | title=None if soup.title is None else soup.title.string,
80 | text_content=webpage_text,
81 | )
82 |
--------------------------------------------------------------------------------
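
The redirect decoding above can be exercised in isolation. A small sketch with a fabricated `u` value, built the way Bing's appears to be structured (a two-character prefix followed by an unpadded base64url destination):

```python
import base64

# Fabricated sample: a 2-character prefix plus an unpadded base64url destination
destination = "https://example.org/a/b/c"
u = "a1" + base64.urlsafe_b64encode(destination.encode()).decode().rstrip("=")

# Decode it the same way BingSerpConverter does: drop the prefix, re-pad,
# and decode with the RFC 4648 base64url alphabet ("-" and "_")
decoded = base64.b64decode(u[2:].strip() + "==", altchars="-_").decode("utf-8")
assert decoded == destination
```
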
/core/converters/custommarkdownify.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import Any
3 | from urllib.parse import urlparse, urlunparse, quote, unquote
4 |
5 | import markdownify
6 |
7 |
8 | class _CustomMarkdownify(markdownify.MarkdownConverter):
9 | """
10 | A custom version of markdownify's MarkdownConverter. Changes include:
11 |
12 | - Altering the default heading style to use '#', '##', etc.
13 | - Removing javascript hyperlinks.
14 | - Truncating images with large data:uri sources.
15 | - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
16 | """
17 |
18 | def __init__(self, **options: Any):
19 | options["heading_style"] = options.get("heading_style", markdownify.ATX)
20 | # Explicitly cast options to the expected type if necessary
21 | super().__init__(**options)
22 |
23 | def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
24 | """Same as usual, but be sure to start with a new line"""
25 | if not convert_as_inline:
26 | if not re.search(r"^\n", text):
27 | return "\n" + super().convert_hn(n, el, text, convert_as_inline) # type: ignore
28 |
29 | return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
30 |
31 | def convert_a(self, el: Any, text: str, convert_as_inline: bool):
32 | """Same as usual converter, but removes Javascript links and escapes URIs."""
33 | prefix, suffix, text = markdownify.chomp(text) # type: ignore
34 | if not text:
35 | return ""
36 | href = el.get("href")
37 | title = el.get("title")
38 |
39 | # Escape URIs and skip non-http or file schemes
40 | if href:
41 | try:
42 | parsed_url = urlparse(href) # type: ignore
43 | if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore
44 | return "%s%s%s" % (prefix, text, suffix)
45 | href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore
46 | except ValueError: # It's not clear if this ever gets thrown
47 | return "%s%s%s" % (prefix, text, suffix)
48 |
49 | # For the replacement see #29: text nodes underscores are escaped
50 | if (
51 | self.options["autolinks"]
52 | and text.replace(r"\_", "_") == href
53 | and not title
54 | and not self.options["default_title"]
55 | ):
56 | # Shortcut syntax
57 | return "<%s>" % href
58 | if self.options["default_title"] and not title:
59 | title = href
60 | title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
61 | return (
62 | "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)
63 | if href
64 | else text
65 | )
66 |
67 | def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
68 | """Same as usual converter, but removes data URIs"""
69 |
70 | alt = el.attrs.get("alt", None) or ""
71 | src = el.attrs.get("src", None) or ""
72 | title = el.attrs.get("title", None) or ""
73 | title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
74 | if (
75 | convert_as_inline
76 | and el.parent.name not in self.options["keep_inline_images_in"]
77 | ):
78 | return alt
79 |
80 | # Remove dataURIs
81 | if src.startswith("data:"):
82 | src = src.split(",")[0] + "..."
83 |
84 |         return "![%s](%s%s)" % (alt, src, title_part)
85 |
86 | def convert_soup(self, soup: Any) -> str:
87 | return super().convert_soup(soup) # type: ignore
88 |
--------------------------------------------------------------------------------
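
A quick sanity check of the behaviors listed in the docstring above; the expected outputs in the comments follow directly from the overridden methods.

```python
from core.converters.custommarkdownify import _CustomMarkdownify

md = _CustomMarkdownify()

# ATX headings ('##') instead of markdownify's underlined default
print(md.convert("<h2>Section</h2>"))  # roughly "## Section"

# javascript: links are reduced to their link text
print(md.convert('<a href="javascript:alert(1)">click</a>'))  # "click"

# data: URIs are truncated to the media-type prefix plus "..."
print(md.convert('<img src="data:image/png;base64,iVBORw0KGgo=" alt="dot">'))
```
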
/core/converters/docx.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 |
3 | import mammoth
4 |
5 | from core.base import DocumentConverterResult
6 | from core.converters.html import HtmlConverter
7 |
8 |
9 | class DocxConverter(HtmlConverter):
10 | """
11 |     Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
12 | """
13 |
14 | def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
15 | # Bail if not a DOCX
16 | extension = kwargs.get("file_extension", "")
17 | if extension.lower() != ".docx":
18 | return None
19 |
20 | result = None
21 | with open(local_path, "rb") as docx_file:
22 | style_map = kwargs.get("style_map", None)
23 |
24 | result = mammoth.convert_to_html(docx_file, style_map=style_map)
25 | html_content = result.value
26 | result = self._convert(html_content)
27 |
28 | return result
29 |
--------------------------------------------------------------------------------
/core/converters/html.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Union
2 |
3 | from bs4 import BeautifulSoup
4 |
5 | from core.base import DocumentConverter, DocumentConverterResult
6 | from core.converters.custommarkdownify import _CustomMarkdownify
7 |
8 |
9 | class HtmlConverter(DocumentConverter):
10 | """Anything with content type text/html"""
11 |
12 | def convert(
13 | self, local_path: str, **kwargs: Any
14 | ) -> Union[None, DocumentConverterResult]:
15 | # Bail if not html
16 | extension = kwargs.get("file_extension", "")
17 | if extension.lower() not in [".html", ".htm"]:
18 | return None
19 |
20 | result = None
21 | with open(local_path, "rt", encoding="utf-8") as fh:
22 | result = self._convert(fh.read())
23 |
24 | return result
25 |
26 | def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
27 |         """Helper function that converts an HTML string."""
28 |
29 | # Parse the string
30 | soup = BeautifulSoup(html_content, "html.parser")
31 |
32 | # Remove javascript and style blocks
33 | for script in soup(["script", "style"]):
34 | script.extract()
35 |
36 | # Print only the main content
37 | body_elm = soup.find("body")
38 | webpage_text = ""
39 | if body_elm:
40 | webpage_text = _CustomMarkdownify().convert_soup(body_elm)
41 | else:
42 | webpage_text = _CustomMarkdownify().convert_soup(soup)
43 |
44 | assert isinstance(webpage_text, str)
45 |
46 | return DocumentConverterResult(
47 | title=None if soup.title is None else soup.title.string,
48 | text_content=webpage_text,
49 | )
50 |
--------------------------------------------------------------------------------
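
Converters are invoked with a file path plus keyword hints; a minimal sketch (assuming a local `page.html` exists):

```python
from core.converters.html import HtmlConverter

converter = HtmlConverter()
# The extension hint is how converters decide whether to handle a file;
# anything other than .html/.htm makes this converter return None
result = converter.convert("page.html", file_extension=".html")
if result is not None:
    print(result.title)
    print(result.text_content[:200])
```
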
/core/converters/image.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import mimetypes
3 | from typing import Union
4 |
5 | from core.base import DocumentConverterResult
6 | from core.converters.media import MediaConverter
7 |
8 |
9 | class ImageConverter(MediaConverter):
10 | """
11 | Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
12 | """
13 |
14 | def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
15 | # Bail if not an image
16 | extension = kwargs.get("file_extension", "")
17 | if extension.lower() not in [".jpg", ".jpeg", ".png"]:
18 | return None
19 |
20 | md_content = ""
21 |
22 | # Add metadata
23 | metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
24 | if metadata:
25 | for f in [
26 | "ImageSize",
27 | "Title",
28 | "Caption",
29 | "Description",
30 | "Keywords",
31 | "Artist",
32 | "Author",
33 | "DateTimeOriginal",
34 | "CreateDate",
35 | "GPSPosition",
36 | ]:
37 | if f in metadata:
38 | md_content += f"{f}: {metadata[f]}\n"
39 |
40 | # Try describing the image with GPTV
41 | llm_client = kwargs.get("llm_client")
42 | llm_model = kwargs.get("llm_model")
43 | if llm_client is not None and llm_model is not None:
44 | md_content += (
45 | "\n# Description:\n"
46 | + self._get_llm_description(
47 | local_path,
48 | extension,
49 | llm_client,
50 | llm_model,
51 | prompt=kwargs.get("llm_prompt"),
52 | ).strip()
53 | + "\n"
54 | )
55 | else:
56 | md_content += """
57 | Image description requires the following environment variables to be set:
58 |
59 | - MARKIFY_LLM_API_BASE
60 | - MARKIFY_LLM_API_KEY
61 | - MARKIFY_LLM_MODE
62 | """
63 | return DocumentConverterResult(
64 | title=None,
65 | text_content=md_content,
66 | )
67 |
68 | def _get_llm_description(self, local_path, extension, client, model, prompt=None):
69 | if prompt is None or prompt.strip() == "":
70 | prompt = "Write a detailed caption for this image."
71 |
72 | data_uri = ""
73 | with open(local_path, "rb") as image_file:
74 | content_type, encoding = mimetypes.guess_type("_dummy" + extension)
75 | if content_type is None:
76 | content_type = "image/jpeg"
77 | image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
78 | data_uri = f"data:{content_type};base64,{image_base64}"
79 |
80 | messages = [
81 | {
82 | "role": "user",
83 | "content": [
84 | {"type": "text", "text": prompt},
85 | {
86 | "type": "image_url",
87 | "image_url": {
88 | "url": data_uri,
89 | },
90 | },
91 | ],
92 | }
93 | ]
94 |
95 | response = client.chat.completions.create(model=model, messages=messages)
96 | return response.choices[0].message.content
97 |
--------------------------------------------------------------------------------
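
The `llm_client` hook above matches an OpenAI-style chat-completions client. A sketch of wiring one in; the env var names come from the fallback message above, while the client choice and the default model name are assumptions.

```python
import os

from openai import OpenAI  # any OpenAI-compatible client exposing chat.completions works

from core.converters.image import ImageConverter

client = OpenAI(
    base_url=os.environ["MARKIFY_LLM_API_BASE"],
    api_key=os.environ["MARKIFY_LLM_API_KEY"],
)

result = ImageConverter().convert(
    "photo.jpg",  # illustrative file name
    file_extension=".jpg",
    llm_client=client,
    llm_model=os.environ.get("MARKIFY_LLM_MODE", "gpt-4o-mini"),  # default model is an assumption
    llm_prompt="Write a detailed caption for this image.",
)
print(result.text_content)
```
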
/core/converters/ipynb.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import Any, Union
3 |
4 | from core.base import DocumentConverter, DocumentConverterResult, FileConversionException
5 |
6 |
7 | class IpynbConverter(DocumentConverter):
8 | """Converts Jupyter Notebook (.ipynb) files to Markdown."""
9 |
10 | def convert(
11 | self, local_path: str, **kwargs: Any
12 | ) -> Union[None, DocumentConverterResult]:
13 | # Bail if not ipynb
14 | extension = kwargs.get("file_extension", "")
15 | if extension.lower() != ".ipynb":
16 | return None
17 |
18 | # Parse and convert the notebook
19 | result = None
20 | with open(local_path, "rt", encoding="utf-8") as fh:
21 | notebook_content = json.load(fh)
22 | result = self._convert(notebook_content)
23 |
24 | return result
25 |
26 | def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
27 | """Helper function that converts notebook JSON content to Markdown."""
28 | try:
29 | md_output = []
30 | title = None
31 |
32 | for cell in notebook_content.get("cells", []):
33 | cell_type = cell.get("cell_type", "")
34 | source_lines = cell.get("source", [])
35 |
36 | if cell_type == "markdown":
37 | md_output.append("".join(source_lines))
38 |
39 | # Extract the first # heading as title if not already found
40 | if title is None:
41 | for line in source_lines:
42 | if line.startswith("# "):
43 | title = line.lstrip("# ").strip()
44 | break
45 |
46 | elif cell_type == "code":
47 | # Code cells are wrapped in Markdown code blocks
48 | md_output.append(f"```python\n{''.join(source_lines)}\n```")
49 | elif cell_type == "raw":
50 | md_output.append(f"```\n{''.join(source_lines)}\n```")
51 |
52 | md_text = "\n\n".join(md_output)
53 |
54 | # Check for title in notebook metadata
55 | title = notebook_content.get("metadata", {}).get("title", title)
56 |
57 | return DocumentConverterResult(
58 | title=title,
59 | text_content=md_text,
60 | )
61 |
62 | except Exception as e:
63 | raise FileConversionException(
64 | f"Error converting .ipynb file: {str(e)}"
65 | ) from e
66 |
--------------------------------------------------------------------------------
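
Since the notebook is plain JSON, the conversion can be demonstrated on an in-memory dict, calling the private `_convert` helper directly for illustration:

```python
from core.converters.ipynb import IpynbConverter

notebook = {
    "cells": [
        {"cell_type": "markdown", "source": ["# Demo Notebook\n", "Some prose.\n"]},
        {"cell_type": "code", "source": ["print('hello')\n"]},
    ],
    "metadata": {},
}

result = IpynbConverter()._convert(notebook)
print(result.title)         # "Demo Notebook", taken from the first "# " heading
print(result.text_content)  # the markdown cell followed by a fenced python block
```
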
/core/converters/media.py:
--------------------------------------------------------------------------------
1 | import json
2 | import shutil
3 | import subprocess
4 | from warnings import warn
5 |
6 | from core.base import DocumentConverter
7 |
8 |
9 | class MediaConverter(DocumentConverter):
10 | """
11 | Abstract class for multi-modal media (e.g., images and audio)
12 | """
13 |
14 | def _get_metadata(self, local_path, exiftool_path=None):
15 | if not exiftool_path:
16 | which_exiftool = shutil.which("exiftool")
17 | if which_exiftool:
18 | warn(
19 |                 f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown constructor. E.g.,
20 |
21 | md = MarkItDown(exiftool_path="{which_exiftool}")
22 |
23 | This warning will be removed in future releases.
24 | """,
25 | DeprecationWarning,
26 | )
27 |
28 | return None
29 | else:
30 | try:
31 | result = subprocess.run(
32 | [exiftool_path, "-json", local_path], capture_output=True, text=True
33 | ).stdout
34 | return json.loads(result)[0]
35 | except Exception:
36 | return None
37 |
--------------------------------------------------------------------------------
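
Per the deprecation warning above, `exiftool_path` should be passed explicitly. A minimal sketch, calling the private `_get_metadata` helper directly for illustration (assumes exiftool is installed and a local `photo.jpg` exists):

```python
import shutil

from core.converters.image import ImageConverter

# Resolve the path ourselves instead of relying on implicit discovery
exiftool = shutil.which("exiftool")
metadata = ImageConverter()._get_metadata("photo.jpg", exiftool_path=exiftool)
print(metadata)  # dict of EXIF fields on success, None if exiftool is missing or fails
```
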
/core/converters/mineru/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinMountain/markify/622070781f1cdb74506b3ec1ec7d1749c1ef458c/core/converters/mineru/__init__.py
--------------------------------------------------------------------------------
/core/converters/mineru/pdf_processor.py:
--------------------------------------------------------------------------------
1 | import re
2 | import urllib.parse
3 | from pathlib import Path
4 | from typing import Dict
5 |
6 | from magic_pdf.config.enums import SupportedPdfParseMethod
7 | from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
8 | from magic_pdf.data.dataset import PymuDocDataset
9 | from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
10 |
11 | from core.converters.mineru.title_corrector import MarkdownTitleProcessor
12 |
13 |
14 | class PDFProcessor:
15 |     """PDF document processing pipeline"""
16 |
17 | def __init__(self, output_dir: str = "output", base_url: str = "http://localhost:20926", **kwargs):
18 | self.output_dir = Path(output_dir)
19 | self.image_dir = self.output_dir / "images"
20 | self.base_url = base_url
21 | self._prepare_directories()
22 |
23 | def _prepare_directories(self):
24 |         """Create the output directory structure"""
25 | self.image_dir.mkdir(parents=True, exist_ok=True)
26 | self.output_dir.mkdir(exist_ok=True)
27 |
28 | def process(self, pdf_path: str) -> Dict[str, str]:
29 |         """Main PDF processing flow"""
30 | pdf_path = Path(pdf_path)
31 | if not pdf_path.exists():
32 |             raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
33 |
34 | name_stem = pdf_path.stem
35 | writers = {
36 | 'image': FileBasedDataWriter(str(self.image_dir)),
37 | 'markdown': FileBasedDataWriter(str(self.output_dir))
38 | }
39 |
40 |         # Read and parse the PDF
41 | pdf_content = FileBasedDataReader("").read(str(pdf_path))
42 | dataset = PymuDocDataset(pdf_content)
43 |
44 |         # Run the parsing pipeline
45 | if dataset.classify() == SupportedPdfParseMethod.OCR:
46 | result = dataset.apply(doc_analyze, ocr=True).pipe_ocr_mode(writers['image'])
47 | else:
48 | result = dataset.apply(doc_analyze, ocr=False).pipe_txt_mode(writers['image'])
49 |
50 |         # Generate output files
51 | output_files = self._generate_outputs(result, writers, name_stem)
52 |
53 |         # Automatically correct heading levels
54 | self._adjust_title_levels(output_files['markdown'])
55 |
56 | self._replace_image_paths(output_files['markdown'], self.base_url)
57 |
58 | return output_files
59 |
60 | def _generate_outputs(self, result, writers, name_stem: str) -> Dict[str, str]:
61 |         """Generate all output files"""
62 |         # Generate the raw Markdown
63 | md_file = f"{name_stem}.md"
64 | result.dump_md(writers['markdown'], md_file, self.image_dir.name)
65 |
66 |         # Generate intermediate files
67 | # result.dump_content_list(writers['markdown'], f"{name_stem}_content.json")
68 | # result.dump_middle_json(writers['markdown'], f"{name_stem}_middle.json")
69 |
70 | return {
71 | 'markdown': str(self.output_dir / md_file),
72 | 'images': str(self.image_dir),
73 | # 'middle_json': str(self.output_dir / f"{name_stem}_middle.json")
74 | }
75 |
76 | def _replace_image_paths(self, md_path: str, base_url: str):
77 |         """Replace local image paths in the Markdown file with HTTP URLs"""
78 | with open(md_path, 'r', encoding='utf-8') as f:
79 | content = f.read()
80 |
81 |         # Match image links in the Markdown, assuming the format ![alt](images/xxx.jpg)
82 | pattern = r'!\[.*?\]\((images/.*?)\)'
83 | replacement = lambda m: f'![{m.group(0).split("]")[0].split("[")[1]}]({urllib.parse.urljoin(base_url, "images/")}{m.group(1).split("/")[-1]})'
84 | new_content = re.sub(pattern, replacement, content)
85 |
86 |         # Write the modified content back to the file
87 | with open(md_path, 'w', encoding='utf-8') as f:
88 | f.write(new_content)
89 |
90 | def _adjust_title_levels(self, md_path: str):
91 |         """Run Markdown heading correction"""
92 | processor = MarkdownTitleProcessor()
93 | processor.process_file(md_path)
94 |
95 |
96 | if __name__ == "__main__":
97 |     # Example usage
98 | processor = PDFProcessor()
99 | result = processor.process("/path/to/your.pdf")
100 |     print(f"Processing complete, output files: {result}")
101 |
--------------------------------------------------------------------------------
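
The path-rewriting step can be checked in isolation; here the same pattern and replacement are applied to a fabricated sample line, with `base_url` matching `PDFProcessor`'s default:

```python
import re
import urllib.parse

base_url = "http://localhost:20926"
content = "See ![fig 1](images/abc123.jpg) for details."

pattern = r'!\[.*?\]\((images/.*?)\)'
replacement = lambda m: f'![{m.group(0).split("]")[0].split("[")[1]}]({urllib.parse.urljoin(base_url, "images/")}{m.group(1).split("/")[-1]})'
print(re.sub(pattern, replacement, content))
# -> See ![fig 1](http://localhost:20926/images/abc123.jpg) for details.
```
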
/core/converters/mineru/title_corrector.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import List, Tuple, Optional
3 | from pathlib import Path
4 |
5 |
6 | class MarkdownTitleProcessor:
7 |     """Heuristic Markdown heading-level processor"""
8 |
9 | def __init__(self, title_patterns: Optional[List[Tuple[str, int]]] = None):
10 | """
11 |         Initialize the title processor
12 | 
13 |         Args:
14 |             title_patterns: custom title pattern list, formatted as [(regex pattern, base level), ...]
15 | """
16 |         # Default patterns cover mixed Chinese/English headings
17 | self.title_patterns = title_patterns or [
18 |             # Chinese chapter patterns
19 | (r'^(第[一二三四五六七八九十百]+章)\s*[::]?\s*.+', 1),
20 | (r'^(第[一二三四五六七八九十百]+节)\s*[::]?\s*.+', 2),
21 | (r'^【.+】\s*.+', 2),
22 |
23 |             # English chapter patterns
24 | (r'^(Chapter|CHAPTER)\s+\d+\.?\s*[:-]?\s*.+', 1),
25 | (r'^(Section|SECTION)\s+\d+\.?\d*\s*[:-]?\s*.+', 2),
26 |
27 |             # Numeric level patterns
28 |             (r'^\d+(?![.]\d)', 1), # bare leading number: 1
29 |             (r'^\d+\.\d+(?![.]\d)', 2), # second-level numbering: 1.1
30 |             (r'^\d+\.\d+\.\d+(?![.]\d)', 3), # third-level numbering: 1.1.1 (lookahead added so 1.1.1.1 falls through)
31 |             (r'^\d+\.\d+\.\d+\.\d+', 4), # fourth-level numbering: 1.1.1.1
32 |
33 |             # Special markers
34 |             (r'^(※|◆|►)\s*.+', 3), # symbol-prefixed headings
35 |             (r'^(Note|Warning):\s*.+', 4) # note/warning headings
36 | ]
37 |
38 |         # Compile the regular expressions
39 | self.compiled_patterns = [
40 | (re.compile(pattern, re.IGNORECASE), level)
41 | for pattern, level in self.title_patterns
42 | ]
43 |
44 |         # Heading-level stack
45 |         self.level_stack = [0] # [current level, parent level, grandparent level, ...]
46 |
47 | def _clean_title(self, title: str) -> str:
48 |         """Clean up the title text"""
49 |         # Strip common noise characters
50 | title = re.sub(r'^[【《〈((]', '', title)
51 | title = re.sub(r'[】》〉)):.]$', '', title)
52 |         # Trim leading/trailing special symbols
53 | return title.strip('※★▪•·\t ')
54 |
55 | def determine_level(self, title: str) -> int:
56 |         """Heuristically determine the heading level"""
57 | clean_title = self._clean_title(title)
58 |
59 |         # Try the predefined patterns first
60 | for pattern, base_level in self.compiled_patterns:
61 | if pattern.match(clean_title):
62 | return self._calculate_relative_level(base_level)
63 |
64 |         # Fall back to context-based inference when nothing matches
65 | return self._infer_level_from_context(clean_title)
66 |
67 | def _calculate_relative_level(self, base_level: int) -> int:
68 |         """Compute the level relative to the current stack"""
69 |         # Current stack depth
70 | current_depth = len(self.level_stack)
71 |
72 |         # If the base level is deeper than the current depth, nest one level down
73 | if base_level > current_depth:
74 | return current_depth + 1
75 |         # If the base level is shallower, reset the level stack
76 | elif base_level < current_depth:
77 | self.level_stack = self.level_stack[:base_level]
78 | return base_level
79 |
80 | def _infer_level_from_context(self, title: str) -> int:
81 |         """Infer the heading level from context"""
82 |         # Infer from the title's length and content features
83 | if len(title) < 15 and not re.search(r'\s', title):
84 | return min(len(self.level_stack) + 1, 6)
85 | return max(len(self.level_stack), 1)
86 |
87 | def process_line(self, line: str) -> str:
88 |         """Process a single line of Markdown text"""
89 |         # Match heading lines
90 | match = re.match(r'^(#+)\s+(.+)$', line.strip())
91 | if not match:
92 | return line
93 |
94 | original_level = len(match.group(1))
95 | title_content = match.group(2)
96 |
97 |         # Compute the new level
98 |         new_level = self.determine_level(title_content)
99 |         new_level = max(1, min(new_level, 6)) # clamp to levels 1-6
100 |
101 |         # Update the level stack
102 | if new_level > len(self.level_stack):
103 | self.level_stack.append(new_level)
104 | else:
105 | self.level_stack = self.level_stack[:new_level]
106 |
107 | return f"{'#' * new_level} {title_content}\n"
108 |
109 | def process_file(self, input_path: str, output_path: Optional[str] = None):
110 |         """Process an entire Markdown file"""
111 | input_file = Path(input_path)
112 | output_file = Path(output_path) if output_path else input_file
113 |
114 | with input_file.open('r', encoding='utf-8') as f:
115 | lines = f.readlines()
116 |
117 | processed_lines = []
118 | for line in lines:
119 | processed_lines.append(self.process_line(line))
120 |
121 | with output_file.open('w', encoding='utf-8') as f:
122 | f.writelines(processed_lines)
123 |
124 |
125 | if __name__ == '__main__':
126 |     # Example usage (the undefined main() is replaced; the path is illustrative)
127 |     MarkdownTitleProcessor().process_file("/path/to/your.md")
--------------------------------------------------------------------------------
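
A minimal sketch of the heuristics above on a few heading styles; the sample strings are chosen to hit the chapter, numeric, and note patterns, and the levels in the comment follow from `determine_level`:

```python
from core.converters.mineru.title_corrector import MarkdownTitleProcessor

processor = MarkdownTitleProcessor()

# Chapter headings map to level 1, "1.1" numbering to level 2, and a "Note:"
# heading is nested relative to the current stack (level 3 here)
for line in ["# 第一章:概述\n", "# 1.1 背景\n", "# Note: terminology\n"]:
    print(processor.process_line(line), end="")
```
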
/core/converters/mp3.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | from typing import Union
4 | from warnings import catch_warnings, resetwarnings
5 |
6 | # Optional Transcription support
7 | IS_AUDIO_TRANSCRIPTION_CAPABLE = False
8 | try:
9 | # Using warnings' catch_warnings to catch
10 | # pydub's warning of ffmpeg or avconv missing
11 | with catch_warnings(record=True) as w:
12 | import pydub
13 |
14 | if w:
15 | raise ModuleNotFoundError
16 | import speech_recognition as sr
17 |
18 | IS_AUDIO_TRANSCRIPTION_CAPABLE = True
19 | except ModuleNotFoundError:
20 | pass
21 | finally:
22 | resetwarnings()
23 |
24 | from core.base import DocumentConverterResult
25 | from core.converters.wav import WavConverter
26 |
27 |
28 | class Mp3Converter(WavConverter):
29 | """
30 | Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
31 | """
32 |
33 | def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
34 | # Bail if not a MP3
35 | extension = kwargs.get("file_extension", "")
36 | if extension.lower() != ".mp3":
37 | return None
38 |
39 | md_content = ""
40 |
41 | # Add metadata
42 | metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
43 | if metadata:
44 | for f in [
45 | "Title",
46 | "Artist",
47 | "Author",
48 | "Band",
49 | "Album",
50 | "Genre",
51 | "Track",
52 | "DateTimeOriginal",
53 | "CreateDate",
54 | "Duration",
55 | ]:
56 | if f in metadata:
57 | md_content += f"{f}: {metadata[f]}\n"
58 |
59 | # Transcribe
60 | if IS_AUDIO_TRANSCRIPTION_CAPABLE:
61 | handle, temp_path = tempfile.mkstemp(suffix=".wav")
62 | os.close(handle)
63 | try:
64 | sound = pydub.AudioSegment.from_mp3(local_path)
65 | sound.export(temp_path, format="wav")
66 |
67 | _args = dict()
68 | _args.update(kwargs)
69 | _args["file_extension"] = ".wav"
70 |
71 | try:
72 | transcript = super()._transcribe_audio(temp_path).strip()
73 | md_content += "\n\n### Audio Transcript:\n" + (
74 | "[No speech detected]" if transcript == "" else transcript
75 | )
76 | except Exception:
77 | md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
78 |
79 | finally:
80 | os.unlink(temp_path)
81 |
82 | # Return the result
83 | return DocumentConverterResult(
84 | title=None,
85 | text_content=md_content.strip(),
86 | )
87 |
--------------------------------------------------------------------------------
/core/converters/outlook.py:
--------------------------------------------------------------------------------
1 | from typing import Union, Any
2 |
3 | from olefile import olefile
4 |
5 | from core.base import FileConversionException, DocumentConverterResult, DocumentConverter
6 |
7 |
8 | class OutlookMsgConverter(DocumentConverter):
9 | """Converts Outlook .msg files to markdown by extracting email metadata and content.
10 |
11 | Uses the olefile package to parse the .msg file structure and extract:
12 | - Email headers (From, To, Subject)
13 | - Email body content
14 | """
15 |
16 | def convert(
17 | self, local_path: str, **kwargs: Any
18 | ) -> Union[None, DocumentConverterResult]:
19 | # Bail if not a MSG file
20 | extension = kwargs.get("file_extension", "")
21 | if extension.lower() != ".msg":
22 | return None
23 |
24 | try:
25 | msg = olefile.OleFileIO(local_path)
26 | # Extract email metadata
27 | md_content = "# Email Message\n\n"
28 |
29 | # Get headers
30 | headers = {
31 | "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
32 | "To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
33 | "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
34 | }
35 |
36 | # Add headers to markdown
37 | for key, value in headers.items():
38 | if value:
39 | md_content += f"**{key}:** {value}\n"
40 |
41 | md_content += "\n## Content\n\n"
42 |
43 | # Get email body
44 | body = self._get_stream_data(msg, "__substg1.0_1000001F")
45 | if body:
46 | md_content += body
47 |
48 | msg.close()
49 |
50 | return DocumentConverterResult(
51 | title=headers.get("Subject"), text_content=md_content.strip()
52 | )
53 |
54 | except Exception as e:
55 | raise FileConversionException(
56 | f"Could not convert MSG file '{local_path}': {str(e)}"
57 | )
58 |
59 | def _get_stream_data(
60 | self, msg: olefile.OleFileIO, stream_path: str
61 | ) -> Union[str, None]:
62 | """Helper to safely extract and decode stream data from the MSG file."""
63 | try:
64 | if msg.exists(stream_path):
65 | data = msg.openstream(stream_path).read()
66 | # Try UTF-16 first (common for .msg files)
67 | try:
68 | return data.decode("utf-16-le").strip()
69 | except UnicodeDecodeError:
70 | # Fall back to UTF-8
71 | try:
72 | return data.decode("utf-8").strip()
73 | except UnicodeDecodeError:
74 | # Last resort - ignore errors
75 | return data.decode("utf-8", errors="ignore").strip()
76 | except Exception:
77 | pass
78 | return None
--------------------------------------------------------------------------------
/core/converters/pdf.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 | from pathlib import Path
3 |
4 | from core.base import DocumentConverter, DocumentConverterResult, FileConversionException
5 |
6 |
7 | class PdfConverter(DocumentConverter):
8 |     """Default PDF parser (simple mode, based on pdfminer)"""
9 |
10 | def convert(self, local_path: str, **kwargs) -> Union[None, DocumentConverterResult]:
11 | # Bail if not a pdf
12 | extension = kwargs.get("file_extension", "")
13 | if extension.lower() != ".pdf":
14 | return None
15 | try:
16 | import pdfminer.high_level
17 | return DocumentConverterResult(
18 | title=None,
19 | text_content=pdfminer.high_level.extract_text(local_path)
20 | )
21 | except Exception as e:
22 |             raise FileConversionException(f"Simple PDF parsing failed: {str(e)}")
23 |
24 |
25 | class AdvancedPdfConverter(DocumentConverter):
26 |     """Enhanced PDF parser using MinerU (advanced mode)"""
27 | 
28 |     def convert(self, local_path: str, **kwargs) -> Union[None, DocumentConverterResult]:
29 | # Bail if not a pdf
30 | extension = kwargs.get("file_extension", "")
31 | if extension.lower() != ".pdf":
32 | return None
33 |
34 | try:
35 | from core.converters.mineru.pdf_processor import PDFProcessor
36 | processor = PDFProcessor(**kwargs)
37 | result = processor.process(local_path)
38 |
39 |             # Read the generated Markdown file
40 | with open(result["markdown"], "r", encoding="utf-8") as f:
41 | md_content = f.read()
42 |
43 | return DocumentConverterResult(
44 | title=Path(local_path).stem,
45 | text_content=md_content
46 | )
47 | except ImportError:
48 |             raise RuntimeError("mineru module not found; please install the MinerU parser")
49 |         except Exception as e:
50 |             raise FileConversionException(f"Advanced PDF parsing failed: {str(e)}")
51 |
52 |
53 | class CloudPdfConverter(DocumentConverter):
54 |     """Cloud PDF parser (placeholder for the cloud mode implementation)"""
55 | 
56 |     def convert(self, local_path: str, **kwargs) -> Union[None, DocumentConverterResult]:
57 | # Bail if not a pdf
58 | extension = kwargs.get("file_extension", "")
59 | if extension.lower() != ".pdf":
60 | return None
61 |         raise NotImplementedError("Cloud mode is not implemented yet")
62 |
63 |
--------------------------------------------------------------------------------
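
How the three modes line up with converter classes. The dict-based dispatch here is an illustrative assumption about how `mode` is consumed; the actual wiring lives elsewhere in the repo. The file name is also illustrative.

```python
from core.converters.pdf import AdvancedPdfConverter, CloudPdfConverter, PdfConverter

# Illustrative mode -> converter dispatch (the repo's actual wiring may differ)
PDF_CONVERTERS = {
    "simple": PdfConverter,
    "advanced": AdvancedPdfConverter,
    "cloud": CloudPdfConverter,
}

converter = PDF_CONVERTERS["simple"]()
result = converter.convert("paper.pdf", file_extension=".pdf")
print(result.text_content[:500])
```
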
/core/converters/plaintext.py:
--------------------------------------------------------------------------------
1 | import mimetypes
2 | from typing import Any, Union
3 |
4 | from charset_normalizer import from_path
5 |
6 | from core.base import DocumentConverter, DocumentConverterResult
7 |
8 |
9 | class PlainTextConverter(DocumentConverter):
10 | """Anything with content type text/plain"""
11 |
12 | def convert(
13 | self, local_path: str, **kwargs: Any
14 | ) -> Union[None, DocumentConverterResult]:
15 | # Guess the content type from any file extension that might be around
16 | content_type, _ = mimetypes.guess_type(
17 | "__placeholder" + kwargs.get("file_extension", "")
18 | )
19 |
20 | # Only accept text files
21 | if content_type is None:
22 | return None
23 | elif all(
24 | not content_type.lower().startswith(type_prefix)
25 | for type_prefix in ["text/", "application/json"]
26 | ):
27 | return None
28 |
29 | text_content = str(from_path(local_path).best())
30 | return DocumentConverterResult(
31 | title=None,
32 | text_content=text_content,
33 | )
34 |
--------------------------------------------------------------------------------
/core/converters/pptx.py:
--------------------------------------------------------------------------------
1 | import html
2 | import re
3 | from typing import Union
4 |
5 | import pptx
6 |
7 | from core.base import DocumentConverterResult
8 | from core.converters.html import HtmlConverter
9 |
10 |
11 | class PptxConverter(HtmlConverter):
12 | """
13 | Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
14 | """
15 |
16 | def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
17 | # Bail if not a PPTX
18 | extension = kwargs.get("file_extension", "")
19 | if extension.lower() != ".pptx":
20 | return None
21 |
22 | md_content = ""
23 |
24 | presentation = pptx.Presentation(local_path)
25 | slide_num = 0
26 | for slide in presentation.slides:
27 | slide_num += 1
28 |
29 |             md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
30 |
31 | title = slide.shapes.title
32 | for shape in slide.shapes:
33 | # Pictures
34 | if self._is_picture(shape):
35 | # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
36 | alt_text = ""
37 | try:
38 | alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
39 | except Exception:
40 | pass
41 |
42 | # A placeholder name
43 | filename = re.sub(r"\W", "", shape.name) + ".jpg"
44 |                     md_content += (
45 |                         "\n!["
46 |                         + (alt_text if alt_text else shape.name)
47 |                         + "]("
48 |                         + filename
49 |                         + ")\n"
50 |                     )
51 |
52 | # Tables
53 | if self._is_table(shape):
54 |                     html_table = "<html><body><table>"
55 |                     first_row = True
56 |                     for row in shape.table.rows:
57 |                         html_table += "<tr>"
58 |                         for cell in row.cells:
59 |                             if first_row:
60 |                                 html_table += "<th>" + html.escape(cell.text) + "</th>"
61 |                             else:
62 |                                 html_table += "<td>" + html.escape(cell.text) + "</td>"
63 |                         html_table += "</tr>"
64 |                         first_row = False
65 |                     html_table += "</table></body></html>"
66 | md_content += (
67 | "\n" + self._convert(html_table).text_content.strip() + "\n"
68 | )
69 |
70 | # Charts
71 | if shape.has_chart:
72 | md_content += self._convert_chart_to_markdown(shape.chart)
73 |
74 | # Text areas
75 | elif shape.has_text_frame:
76 | if shape == title:
77 | md_content += "# " + shape.text.lstrip() + "\n"
78 | else:
79 | md_content += shape.text + "\n"
80 |
81 | md_content = md_content.strip()
82 |
83 | if slide.has_notes_slide:
84 | md_content += "\n\n### Notes:\n"
85 | notes_frame = slide.notes_slide.notes_text_frame
86 | if notes_frame is not None:
87 | md_content += notes_frame.text
88 | md_content = md_content.strip()
89 |
90 | return DocumentConverterResult(
91 | title=None,
92 | text_content=md_content.strip(),
93 | )
94 |
95 | def _is_picture(self, shape):
96 | if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
97 | return True
98 | if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
99 | if hasattr(shape, "image"):
100 | return True
101 | return False
102 |
103 | def _is_table(self, shape):
104 | if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
105 | return True
106 | return False
107 |
108 | def _convert_chart_to_markdown(self, chart):
109 | md = "\n\n### Chart"
110 | if chart.has_title:
111 | md += f": {chart.chart_title.text_frame.text}"
112 | md += "\n\n"
113 | data = []
114 | category_names = [c.label for c in chart.plots[0].categories]
115 | series_names = [s.name for s in chart.series]
116 | data.append(["Category"] + series_names)
117 |
118 | for idx, category in enumerate(category_names):
119 | row = [category]
120 | for series in chart.series:
121 | row.append(series.values[idx])
122 | data.append(row)
123 |
124 | markdown_table = []
125 | for row in data:
126 | markdown_table.append("| " + " | ".join(map(str, row)) + " |")
127 | header = markdown_table[0]
128 | separator = "|" + "|".join(["---"] * len(data[0])) + "|"
129 | return md + "\n".join([header, separator] + markdown_table[1:])
130 |
--------------------------------------------------------------------------------
/core/converters/rss.py:
--------------------------------------------------------------------------------
1 | import traceback
2 | from typing import Union
3 | from xml.dom import minidom
4 |
5 | from bs4 import BeautifulSoup
6 |
7 | from core.base import DocumentConverter, DocumentConverterResult
8 | from core.converters.custommarkdownify import _CustomMarkdownify
9 |
10 |
11 | class RSSConverter(DocumentConverter):
12 | """Convert RSS / Atom type to markdown"""
13 |
14 | def convert(
15 | self, local_path: str, **kwargs
16 | ) -> Union[None, DocumentConverterResult]:
17 | # Bail if not RSS type
18 | extension = kwargs.get("file_extension", "")
19 | if extension.lower() not in [".xml", ".rss", ".atom"]:
20 | return None
21 | try:
22 | doc = minidom.parse(local_path)
23 | except BaseException as _:
24 | return None
25 | result = None
26 | if doc.getElementsByTagName("rss"):
27 |             # An RSS feed must have a root element of <rss>
28 | result = self._parse_rss_type(doc)
29 | elif doc.getElementsByTagName("feed"):
30 | root = doc.getElementsByTagName("feed")[0]
31 | if root.getElementsByTagName("entry"):
32 |                 # An Atom feed must have a root element of <feed> and at least one <entry>
33 | result = self._parse_atom_type(doc)
34 | else:
35 | return None
36 | else:
37 | # not rss or atom
38 | return None
39 |
40 | return result
41 |
42 | def _parse_atom_type(
43 | self, doc: minidom.Document
44 | ) -> Union[None, DocumentConverterResult]:
45 | """Parse the type of an Atom feed.
46 |
47 | Returns None if the feed type is not recognized or something goes wrong.
48 | """
49 | try:
50 | root = doc.getElementsByTagName("feed")[0]
51 | title = self._get_data_by_tag_name(root, "title")
52 | subtitle = self._get_data_by_tag_name(root, "subtitle")
53 | entries = root.getElementsByTagName("entry")
54 | md_text = f"# {title}\n"
55 | if subtitle:
56 | md_text += f"{subtitle}\n"
57 | for entry in entries:
58 | entry_title = self._get_data_by_tag_name(entry, "title")
59 | entry_summary = self._get_data_by_tag_name(entry, "summary")
60 | entry_updated = self._get_data_by_tag_name(entry, "updated")
61 | entry_content = self._get_data_by_tag_name(entry, "content")
62 |
63 | if entry_title:
64 | md_text += f"\n## {entry_title}\n"
65 | if entry_updated:
66 | md_text += f"Updated on: {entry_updated}\n"
67 | if entry_summary:
68 | md_text += self._parse_content(entry_summary)
69 | if entry_content:
70 | md_text += self._parse_content(entry_content)
71 |
72 | return DocumentConverterResult(
73 | title=title,
74 | text_content=md_text,
75 | )
76 | except BaseException as _:
77 | return None
78 |
79 | def _parse_rss_type(
80 | self, doc: minidom.Document
81 | ) -> Union[None, DocumentConverterResult]:
82 | """Parse the type of an RSS feed.
83 |
84 | Returns None if the feed type is not recognized or something goes wrong.
85 | """
86 | try:
87 | root = doc.getElementsByTagName("rss")[0]
88 | channel = root.getElementsByTagName("channel")
89 | if not channel:
90 | return None
91 | channel = channel[0]
92 | channel_title = self._get_data_by_tag_name(channel, "title")
93 | channel_description = self._get_data_by_tag_name(channel, "description")
94 | items = channel.getElementsByTagName("item")
95 | if channel_title:
96 | md_text = f"# {channel_title}\n"
97 | if channel_description:
98 | md_text += f"{channel_description}\n"
99 | if not items:
100 | items = []
101 | for item in items:
102 | title = self._get_data_by_tag_name(item, "title")
103 | description = self._get_data_by_tag_name(item, "description")
104 | pubDate = self._get_data_by_tag_name(item, "pubDate")
105 | content = self._get_data_by_tag_name(item, "content:encoded")
106 |
107 | if title:
108 | md_text += f"\n## {title}\n"
109 | if pubDate:
110 | md_text += f"Published on: {pubDate}\n"
111 | if description:
112 | md_text += self._parse_content(description)
113 | if content:
114 | md_text += self._parse_content(content)
115 |
116 | return DocumentConverterResult(
117 | title=channel_title,
118 | text_content=md_text,
119 | )
120 | except BaseException as _:
121 | print(traceback.format_exc())
122 | return None
123 |
124 | def _parse_content(self, content: str) -> str:
125 | """Parse the content of an RSS feed item"""
126 | try:
127 | # using bs4 because many RSS feeds have HTML-styled content
128 | soup = BeautifulSoup(content, "html.parser")
129 | return _CustomMarkdownify().convert_soup(soup)
130 | except BaseException as _:
131 | return content
132 |
133 | def _get_data_by_tag_name(
134 | self, element: minidom.Element, tag_name: str
135 | ) -> Union[str, None]:
136 | """Get data from first child element with the given tag name.
137 | Returns None when no such element is found.
138 | """
139 | nodes = element.getElementsByTagName(tag_name)
140 | if not nodes:
141 | return None
142 | fc = nodes[0].firstChild
143 | if fc:
144 | return fc.data
145 | return None
146 |
--------------------------------------------------------------------------------
/core/converters/wav.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 | from warnings import catch_warnings, resetwarnings
3 |
4 | # Optional Transcription support
5 | IS_AUDIO_TRANSCRIPTION_CAPABLE = False
6 | try:
7 | # Using warnings' catch_warnings to catch
8 | # pydub's warning of ffmpeg or avconv missing
9 | with catch_warnings(record=True) as w:
10 | import pydub
11 |
12 | if w:
13 | raise ModuleNotFoundError
14 | import speech_recognition as sr
15 |
16 | IS_AUDIO_TRANSCRIPTION_CAPABLE = True
17 | except ModuleNotFoundError:
18 | pass
19 | finally:
20 | resetwarnings()
21 |
22 | from core.base import DocumentConverterResult
23 | from core.converters.media import MediaConverter
24 |
25 |
26 | class WavConverter(MediaConverter):
27 | """
28 | Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
29 | """
30 |
31 | def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
32 | # Bail if not a WAV
33 | extension = kwargs.get("file_extension", "")
34 | if extension.lower() != ".wav":
35 | return None
36 |
37 | md_content = ""
38 |
39 | # Add metadata
40 | metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
41 | if metadata:
42 | for f in [
43 | "Title",
44 | "Artist",
45 | "Author",
46 | "Band",
47 | "Album",
48 | "Genre",
49 | "Track",
50 | "DateTimeOriginal",
51 | "CreateDate",
52 | "Duration",
53 | ]:
54 | if f in metadata:
55 | md_content += f"{f}: {metadata[f]}\n"
56 |
57 | # Transcribe
58 | if IS_AUDIO_TRANSCRIPTION_CAPABLE:
59 | try:
60 | transcript = self._transcribe_audio(local_path)
61 | md_content += "\n\n### Audio Transcript:\n" + (
62 | "[No speech detected]" if transcript == "" else transcript
63 | )
64 | except Exception:
65 | md_content += (
66 | "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
67 | )
68 |
69 | return DocumentConverterResult(
70 | title=None,
71 | text_content=md_content.strip(),
72 | )
73 |
74 | def _transcribe_audio(self, local_path) -> str:
75 | recognizer = sr.Recognizer()
76 | with sr.AudioFile(local_path) as source:
77 | audio = recognizer.record(source)
78 | return recognizer.recognize_google(audio).strip()
79 |
--------------------------------------------------------------------------------
/core/converters/wikipedia.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import Any, Union
3 |
4 | from bs4 import BeautifulSoup
5 |
6 | from core.base import DocumentConverter, DocumentConverterResult
7 | from core.converters.custommarkdownify import _CustomMarkdownify
8 |
9 |
10 | class WikipediaConverter(DocumentConverter):
11 | """Handle Wikipedia pages separately, focusing only on the main document content."""
12 |
13 | def convert(
14 | self, local_path: str, **kwargs: Any
15 | ) -> Union[None, DocumentConverterResult]:
16 | # Bail if not Wikipedia
17 | extension = kwargs.get("file_extension", "")
18 | if extension.lower() not in [".html", ".htm"]:
19 | return None
20 | url = kwargs.get("url", "")
21 |         if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia\.org\/", url):
22 | return None
23 |
24 | # Parse the file
25 | soup = None
26 | with open(local_path, "rt", encoding="utf-8") as fh:
27 | soup = BeautifulSoup(fh.read(), "html.parser")
28 |
29 | # Remove javascript and style blocks
30 | for script in soup(["script", "style"]):
31 | script.extract()
32 |
33 | # Print only the main content
34 | body_elm = soup.find("div", {"id": "mw-content-text"})
35 | title_elm = soup.find("span", {"class": "mw-page-title-main"})
36 |
37 | webpage_text = ""
38 | main_title = None if soup.title is None else soup.title.string
39 |
40 | if body_elm:
41 | # What's the title
42 | if title_elm and len(title_elm) > 0:
43 | main_title = title_elm.string # type: ignore
44 | assert isinstance(main_title, str)
45 |
46 | # Convert the page
47 | webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(
48 | body_elm
49 | )
50 | else:
51 | webpage_text = _CustomMarkdownify().convert_soup(soup)
52 |
53 | return DocumentConverterResult(
54 | title=main_title,
55 | text_content=webpage_text,
56 | )
57 |
--------------------------------------------------------------------------------
/core/converters/xls.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 |
3 | import pandas as pd
4 |
5 | from core.base import DocumentConverterResult
6 | from core.converters.html import HtmlConverter
7 |
8 |
9 | class XlsConverter(HtmlConverter):
10 | """
11 | Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
12 | """
13 |
14 | def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
15 | # Bail if not a XLS
16 | extension = kwargs.get("file_extension", "")
17 | if extension.lower() != ".xls":
18 | return None
19 |
20 | sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
21 | md_content = ""
22 | for s in sheets:
23 | md_content += f"## {s}\n"
24 | html_content = sheets[s].to_html(index=False)
25 | md_content += self._convert(html_content).text_content.strip() + "\n\n"
26 |
27 | return DocumentConverterResult(
28 | title=None,
29 | text_content=md_content.strip(),
30 | )
31 |
--------------------------------------------------------------------------------
/core/converters/xlsx.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 |
3 | import pandas as pd
4 |
5 | from core.base import DocumentConverterResult
6 | from core.converters.html import HtmlConverter
7 |
8 |
9 | class XlsxConverter(HtmlConverter):
10 | """
11 | Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
12 | """
13 |
14 | def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
15 | # Bail if not a XLSX
16 | extension = kwargs.get("file_extension", "")
17 | if extension.lower() != ".xlsx":
18 | return None
19 |
20 | sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
21 | md_content = ""
22 | for s in sheets:
23 | md_content += f"## {s}\n"
24 | html_content = sheets[s].to_html(index=False)
25 | md_content += self._convert(html_content).text_content.strip() + "\n\n"
26 |
27 | return DocumentConverterResult(
28 | title=None,
29 | text_content=md_content.strip(),
30 | )
31 |
--------------------------------------------------------------------------------
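
Usage is the same for both Excel converters; a minimal sketch (the file name is illustrative):

```python
from core.converters.xlsx import XlsxConverter

# Each sheet becomes a "## <sheet name>" heading followed by a Markdown table
result = XlsxConverter().convert("report.xlsx", file_extension=".xlsx")
print(result.text_content)
```
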
/core/converters/youtube.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | from typing import Any, Union, Dict, List
4 | from urllib.parse import urlparse, parse_qs
5 |
6 | from bs4 import BeautifulSoup
7 |
8 | # Optional YouTube transcription support
9 | try:
10 | from youtube_transcript_api import YouTubeTranscriptApi
11 |
12 | IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
13 | except ModuleNotFoundError:
14 |     IS_YOUTUBE_TRANSCRIPT_CAPABLE = False
15 |
16 |
17 | from core.base import DocumentConverter, DocumentConverterResult
18 |
19 |
20 | class YouTubeConverter(DocumentConverter):
21 | """Handle YouTube specially, focusing on the video title, description, and transcript."""
22 |
23 | def convert(
24 | self, local_path: str, **kwargs: Any
25 | ) -> Union[None, DocumentConverterResult]:
26 | # Bail if not YouTube
27 | extension = kwargs.get("file_extension", "")
28 | if extension.lower() not in [".html", ".htm"]:
29 | return None
30 | url = kwargs.get("url", "")
31 | if not url.startswith("https://www.youtube.com/watch?"):
32 | return None
33 |
34 | # Parse the file
35 | soup = None
36 | with open(local_path, "rt", encoding="utf-8") as fh:
37 | soup = BeautifulSoup(fh.read(), "html.parser")
38 |
39 | # Read the meta tags
40 | assert soup.title is not None and soup.title.string is not None
41 | metadata: Dict[str, str] = {"title": soup.title.string}
42 | for meta in soup(["meta"]):
43 | for a in meta.attrs:
44 | if a in ["itemprop", "property", "name"]:
45 | metadata[meta[a]] = meta.get("content", "")
46 | break
47 |
48 | # We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
49 | try:
50 | for script in soup(["script"]):
51 | content = script.text
52 | if "ytInitialData" in content:
53 | lines = re.split(r"\r?\n", content)
54 | obj_start = lines[0].find("{")
55 | obj_end = lines[0].rfind("}")
56 | if obj_start >= 0 and obj_end >= 0:
57 | data = json.loads(lines[0][obj_start : obj_end + 1])
58 | attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore
59 | if attrdesc:
60 | metadata["description"] = str(attrdesc["content"])
61 | break
62 | except Exception:
63 | pass
64 |
65 | # Start preparing the page
66 | webpage_text = "# YouTube\n"
67 |
68 | title = self._get(metadata, ["title", "og:title", "name"]) # type: ignore
69 | assert isinstance(title, str)
70 |
71 | if title:
72 | webpage_text += f"\n## {title}\n"
73 |
74 | stats = ""
75 | views = self._get(metadata, ["interactionCount"]) # type: ignore
76 | if views:
77 | stats += f"- **Views:** {views}\n"
78 |
79 | keywords = self._get(metadata, ["keywords"]) # type: ignore
80 | if keywords:
81 | stats += f"- **Keywords:** {keywords}\n"
82 |
83 | runtime = self._get(metadata, ["duration"]) # type: ignore
84 | if runtime:
85 | stats += f"- **Runtime:** {runtime}\n"
86 |
87 | if len(stats) > 0:
88 | webpage_text += f"\n### Video Metadata\n{stats}\n"
89 |
90 | description = self._get(metadata, ["description", "og:description"]) # type: ignore
91 | if description:
92 | webpage_text += f"\n### Description\n{description}\n"
93 |
94 | if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
95 | transcript_text = ""
96 | parsed_url = urlparse(url) # type: ignore
97 | params = parse_qs(parsed_url.query) # type: ignore
98 | if "v" in params:
99 | assert isinstance(params["v"][0], str)
100 | video_id = str(params["v"][0])
101 | try:
102 | youtube_transcript_languages = kwargs.get(
103 | "youtube_transcript_languages", ("en",)
104 | )
105 | # Must be a single transcript.
106 | transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages) # type: ignore
107 | transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
108 | # Alternative formatting:
109 | # formatter = TextFormatter()
110 | # formatter.format_transcript(transcript)
111 | except Exception:
112 | pass
113 | if transcript_text:
114 | webpage_text += f"\n### Transcript\n{transcript_text}\n"
115 |
116 | title = title if title else soup.title.string
117 | assert isinstance(title, str)
118 |
119 | return DocumentConverterResult(
120 | title=title,
121 | text_content=webpage_text,
122 | )
123 |
124 | def _get(
125 | self,
126 | metadata: Dict[str, str],
127 | keys: List[str],
128 | default: Union[str, None] = None,
129 | ) -> Union[str, None]:
130 | for k in keys:
131 | if k in metadata:
132 | return metadata[k]
133 | return default
134 |
135 | def _findKey(self, json: Any, key: str) -> Union[str, None]: # TODO: Fix json type
136 | if isinstance(json, list):
137 | for elm in json:
138 | ret = self._findKey(elm, key)
139 | if ret is not None:
140 | return ret
141 | elif isinstance(json, dict):
142 | for k in json:
143 | if k == key:
144 | return json[k]
145 | else:
146 | ret = self._findKey(json[k], key)
147 | if ret is not None:
148 | return ret
149 | return None
150 |
--------------------------------------------------------------------------------
/core/converters/zip.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import zipfile
4 | from typing import Any, Union
5 |
6 | from core.base import DocumentConverterResult, DocumentConverter
7 |
8 |
9 | class ZipConverter(DocumentConverter):
10 | """Converts ZIP files to markdown by extracting and converting all contained files.
11 |
12 | The converter extracts the ZIP contents to a temporary directory, processes each file
13 | using appropriate converters based on file extensions, and then combines the results
14 | into a single markdown document. The temporary directory is cleaned up after processing.
15 |
16 | Example output format:
17 | ```markdown
18 | Content from the zip file `example.zip`:
19 |
20 | ## File: docs/readme.txt
21 |
22 | This is the content of readme.txt
23 | Multiple lines are preserved
24 |
25 | ## File: images/example.jpg
26 |
27 | ImageSize: 1920x1080
28 | DateTimeOriginal: 2024-02-15 14:30:00
29 | Description: A beautiful landscape photo
30 |
31 | ## File: data/report.xlsx
32 |
33 | ## Sheet1
34 | | Column1 | Column2 | Column3 |
35 | |---------|---------|---------|
36 | | data1 | data2 | data3 |
37 | | data4 | data5 | data6 |
38 | ```
39 |
40 | Key features:
41 | - Maintains original file structure in headings
42 | - Processes nested files recursively
43 | - Uses appropriate converters for each file type
44 | - Preserves formatting of converted content
45 | - Cleans up temporary files after processing
46 | """
47 |
48 | def convert(
49 | self, local_path: str, **kwargs: Any
50 | ) -> Union[None, DocumentConverterResult]:
51 | # Bail if not a ZIP
52 | extension = kwargs.get("file_extension", "")
53 | if extension.lower() != ".zip":
54 | return None
55 |
56 | # Get parent converters list if available
57 | parent_converters = kwargs.get("_parent_converters", [])
58 | if not parent_converters:
59 | return DocumentConverterResult(
60 | title=None,
61 | text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
62 | )
63 |
64 | extracted_zip_folder_name = (
65 | f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
66 | )
67 | extraction_dir = os.path.normpath(
68 | os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
69 | )
70 | md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"
71 |
72 | try:
73 | # Extract the zip file safely
74 | with zipfile.ZipFile(local_path, "r") as zipObj:
75 | # Safeguard against path traversal
76 | for member in zipObj.namelist():
77 | member_path = os.path.normpath(os.path.join(extraction_dir, member))
78 | if (
79 | not os.path.commonprefix([extraction_dir, member_path])
80 | == extraction_dir
81 | ):
82 | raise ValueError(
83 | f"Path traversal detected in zip file: {member}"
84 | )
85 |
86 | # Extract all files safely
87 | zipObj.extractall(path=extraction_dir)
88 |
89 | # Process each extracted file
90 | for root, dirs, files in os.walk(extraction_dir):
91 | for name in files:
92 | file_path = os.path.join(root, name)
93 | relative_path = os.path.relpath(file_path, extraction_dir)
94 |
95 | # Get file extension
96 | _, file_extension = os.path.splitext(name)
97 |
98 | # Update kwargs for the file
99 | file_kwargs = kwargs.copy()
100 | file_kwargs["file_extension"] = file_extension
101 | file_kwargs["_parent_converters"] = parent_converters
102 |
103 | # Try converting the file using available converters
104 | for converter in parent_converters:
105 | # Skip the zip converter to avoid infinite recursion
106 | if isinstance(converter, ZipConverter):
107 | continue
108 |
109 | result = converter.convert(file_path, **file_kwargs)
110 | if result is not None:
111 | md_content += f"\n## File: {relative_path}\n\n"
112 | md_content += result.text_content + "\n\n"
113 | break
114 |
115 | # Clean up extracted files if specified
116 | if kwargs.get("cleanup_extracted", True):
117 | shutil.rmtree(extraction_dir)
118 |
119 | return DocumentConverterResult(title=None, text_content=md_content.strip())
120 |
121 | except zipfile.BadZipFile:
122 | return DocumentConverterResult(
123 | title=None,
124 | text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
125 | )
126 | except ValueError as ve:
127 | return DocumentConverterResult(
128 | title=None,
129 | text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
130 | )
131 | except Exception as e:
132 | return DocumentConverterResult(
133 | title=None,
134 | text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
135 | )
--------------------------------------------------------------------------------
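
Because `ZipConverter.convert` only delegates when `_parent_converters` is supplied, the easiest way to exercise it is through `MarkItDown` (defined in the next file), whose `_convert` injects that list automatically. A minimal sketch, where the archive name `bundle.zip` is a placeholder:

```python
from core.markitdown import MarkItDown

md = MarkItDown()  # registers ZipConverter along with the other converters

# _convert() passes _parent_converters down, so each file inside the archive
# is routed to the converter matching its extension; cleanup_extracted
# controls whether the extraction directory is removed afterwards.
result = md.convert("bundle.zip", cleanup_extracted=True)
print(result.text_content)
```
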
/core/markitdown.py:
--------------------------------------------------------------------------------
1 | # type: ignore
2 | import copy
3 | import mimetypes
4 | import os
5 | import re
6 | import tempfile
7 | import traceback
8 | from pathlib import Path
9 | from typing import Any, List, Optional, Union
10 | from urllib.parse import urlparse
11 |
12 | # File-format detection
13 | import puremagic
14 | import requests
15 |
16 | from core.base import DocumentConverterResult, DocumentConverter, FileConversionException, UnsupportedFormatException
17 | from core.converters.bingsearch import BingSerpConverter
18 | from core.converters.docx import DocxConverter
19 | from core.converters.html import HtmlConverter
20 | from core.converters.image import ImageConverter
21 | from core.converters.ipynb import IpynbConverter
22 | from core.converters.mp3 import Mp3Converter
23 | from core.converters.outlook import OutlookMsgConverter
24 | from core.converters.plaintext import PlainTextConverter
25 | from core.converters.pptx import PptxConverter
26 | from core.converters.rss import RSSConverter
27 | from core.converters.wav import WavConverter
28 | from core.converters.wikipedia import WikipediaConverter
29 | from core.converters.xls import XlsConverter
30 | from core.converters.xlsx import XlsxConverter
31 | from core.converters.youtube import YouTubeConverter
32 | from core.converters.zip import ZipConverter
33 |
34 |
35 | class MarkItDown:
36 | """(In preview) An extremely simple text-based document reader, suitable for LLM use.
37 | This reader will convert common file-types or webpages to Markdown."""
38 |
39 | def __init__(
40 | self,
41 | requests_session: Optional[requests.Session] = None,
42 | llm_client: Optional[Any] = None,
43 | llm_model: Optional[str] = None,
44 | style_map: Optional[str] = None,
45 | exiftool_path: Optional[str] = None,
46 | mode: str = "simple", # simple|advanced|cloud
47 | ):
48 | self.mode = mode
49 | if requests_session is None:
50 | self._requests_session = requests.Session()
51 | else:
52 | self._requests_session = requests_session
53 |
54 | if exiftool_path is None:
55 | exiftool_path = os.environ.get("EXIFTOOL_PATH")
56 |
57 | self._llm_client = llm_client
58 | self._llm_model = llm_model
59 | self._style_map = style_map
60 | self._exiftool_path = exiftool_path
61 |
62 | self._page_converters: List[DocumentConverter] = []
63 |
64 | # Register converters for successful browsing operations
65 | # Later registrations are tried first / take higher priority than earlier registrations
66 | # To this end, the most specific converters should appear below the most generic converters
67 | self.register_page_converter(PlainTextConverter())
68 | self.register_page_converter(HtmlConverter())
69 | self.register_page_converter(RSSConverter())
70 | self.register_page_converter(WikipediaConverter())
71 | self.register_page_converter(YouTubeConverter())
72 | self.register_page_converter(BingSerpConverter())
73 | self.register_page_converter(DocxConverter())
74 | self.register_page_converter(XlsxConverter())
75 | self.register_page_converter(XlsConverter())
76 | self.register_page_converter(PptxConverter())
77 | self.register_page_converter(WavConverter())
78 | self.register_page_converter(Mp3Converter())
79 | self.register_page_converter(ImageConverter())
80 | self.register_page_converter(IpynbConverter())
81 |
82 |         # Dynamically register the PDF converter for the selected mode
83 |         # and ensure it only handles PDF files
84 | if self.mode == 'advanced':
85 | from core.converters.pdf import AdvancedPdfConverter
86 | self.register_page_converter(AdvancedPdfConverter())
87 | elif self.mode == 'cloud':
88 | from core.converters.pdf import CloudPdfConverter
89 | self.register_page_converter(CloudPdfConverter())
90 |         else:  # default "simple" mode
91 | from core.converters.pdf import PdfConverter
92 | self.register_page_converter(PdfConverter())
93 |
94 | self.register_page_converter(ZipConverter())
95 | self.register_page_converter(OutlookMsgConverter())
96 |
97 | def convert(
98 | self, source: Union[str, requests.Response, Path], **kwargs: Any
99 | ) -> DocumentConverterResult: # TODO: deal with kwargs
100 | """
101 | Args:
102 |         - source: a path (as a string or pathlib.Path), a URL string, or a requests.Response object
103 | - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
104 | """
105 |
106 | # Local path or url
107 | if isinstance(source, str):
108 | if (
109 | source.startswith("http://")
110 | or source.startswith("https://")
111 | or source.startswith("file://")
112 | ):
113 | return self.convert_url(source, **kwargs)
114 | else:
115 | return self.convert_local(source, **kwargs)
116 | # Request response
117 | elif isinstance(source, requests.Response):
118 | return self.convert_response(source, **kwargs)
119 | elif isinstance(source, Path):
120 | return self.convert_local(source, **kwargs)
121 |
122 | def convert_local(
123 | self, path: Union[str, Path], **kwargs: Any
124 | ) -> DocumentConverterResult: # TODO: deal with kwargs
125 | if isinstance(path, Path):
126 | path = str(path)
127 | # Prepare a list of extensions to try (in order of priority)
128 | ext = kwargs.get("file_extension")
129 | extensions = [ext] if ext is not None else []
130 |
131 | # Get extension alternatives from the path and puremagic
132 | base, ext = os.path.splitext(path)
133 | self._append_ext(extensions, ext)
134 |
135 | for g in self._guess_ext_magic(path):
136 | self._append_ext(extensions, g)
137 |
138 | # Convert
139 | return self._convert(path, extensions, **kwargs)
140 |
141 | # TODO what should stream's type be?
142 | def convert_stream(
143 | self, stream: Any, **kwargs: Any
144 | ) -> DocumentConverterResult: # TODO: deal with kwargs
145 | # Prepare a list of extensions to try (in order of priority)
146 | ext = kwargs.get("file_extension")
147 | extensions = [ext] if ext is not None else []
148 |
149 | # Save the file locally to a temporary file. It will be deleted before this method exits
150 | handle, temp_path = tempfile.mkstemp()
151 | fh = os.fdopen(handle, "wb")
152 | result = None
153 | try:
154 | # Write to the temporary file
155 | content = stream.read()
156 | if isinstance(content, str):
157 | fh.write(content.encode("utf-8"))
158 | else:
159 | fh.write(content)
160 | fh.close()
161 |
162 | # Use puremagic to check for more extension options
163 | for g in self._guess_ext_magic(temp_path):
164 | self._append_ext(extensions, g)
165 |
166 | # Convert
167 | result = self._convert(temp_path, extensions, **kwargs)
168 | # Clean up
169 | finally:
170 | try:
171 | fh.close()
172 | except Exception:
173 | pass
174 | os.unlink(temp_path)
175 |
176 | return result
177 |
178 | def convert_url(
179 | self, url: str, **kwargs: Any
180 | ) -> DocumentConverterResult: # TODO: fix kwargs type
181 | # Send a HTTP request to the URL
182 | response = self._requests_session.get(url, stream=True)
183 | response.raise_for_status()
184 | return self.convert_response(response, **kwargs)
185 |
186 | def convert_response(
187 | self, response: requests.Response, **kwargs: Any
188 | ) -> DocumentConverterResult: # TODO fix kwargs type
189 | # Prepare a list of extensions to try (in order of priority)
190 | ext = kwargs.get("file_extension")
191 | extensions = [ext] if ext is not None else []
192 |
193 | # Guess from the mimetype
194 | content_type = response.headers.get("content-type", "").split(";")[0]
195 | self._append_ext(extensions, mimetypes.guess_extension(content_type))
196 |
197 | # Read the content disposition if there is one
198 | content_disposition = response.headers.get("content-disposition", "")
199 | m = re.search(r"filename=([^;]+)", content_disposition)
200 | if m:
201 | base, ext = os.path.splitext(m.group(1).strip("\"'"))
202 | self._append_ext(extensions, ext)
203 |
204 |         # Read the extension from the URL path
205 | base, ext = os.path.splitext(urlparse(response.url).path)
206 | self._append_ext(extensions, ext)
207 |
208 | # Save the file locally to a temporary file. It will be deleted before this method exits
209 | handle, temp_path = tempfile.mkstemp()
210 | fh = os.fdopen(handle, "wb")
211 | result = None
212 | try:
213 | # Download the file
214 | for chunk in response.iter_content(chunk_size=512):
215 | fh.write(chunk)
216 | fh.close()
217 |
218 | # Use puremagic to check for more extension options
219 | for g in self._guess_ext_magic(temp_path):
220 | self._append_ext(extensions, g)
221 |
222 | # Convert
223 | result = self._convert(temp_path, extensions, url=response.url, **kwargs)
224 | # Clean up
225 | finally:
226 | try:
227 | fh.close()
228 | except Exception:
229 | pass
230 | os.unlink(temp_path)
231 |
232 | return result
233 |
234 | def _convert(
235 | self, local_path: str, extensions: List[Union[str, None]], **kwargs
236 | ) -> DocumentConverterResult:
237 | error_trace = ""
238 | for ext in extensions + [None]: # Try last with no extension
239 | for converter in self._page_converters:
240 | _kwargs = copy.deepcopy(kwargs)
241 |
242 | # Overwrite file_extension appropriately
243 | if ext is None:
244 | if "file_extension" in _kwargs:
245 | del _kwargs["file_extension"]
246 | else:
247 | _kwargs.update({"file_extension": ext})
248 |
249 | # Copy any additional global options
250 | if "llm_client" not in _kwargs and self._llm_client is not None:
251 | _kwargs["llm_client"] = self._llm_client
252 |
253 | if "llm_model" not in _kwargs and self._llm_model is not None:
254 | _kwargs["llm_model"] = self._llm_model
255 |
256 | if "style_map" not in _kwargs and self._style_map is not None:
257 | _kwargs["style_map"] = self._style_map
258 |
259 | if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
260 | _kwargs["exiftool_path"] = self._exiftool_path
261 |
262 | # Add the list of converters for nested processing
263 | _kwargs["_parent_converters"] = self._page_converters
264 |
265 |                 # If we hit an error, record it and keep trying
266 |                 res = None
267 |                 try:
268 |                     res = converter.convert(local_path, **_kwargs)
269 |                 except Exception:
270 |                     error_trace = ("\n\n" + traceback.format_exc()).strip()
271 | if res is not None:
272 | # Normalize the content
273 | res.text_content = "\n".join(
274 | [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
275 | )
276 | res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
277 |
278 |                 # Success: return the normalized result
279 | return res
280 |
281 | # If we got this far without success, report any exceptions
282 | if len(error_trace) > 0:
283 | raise FileConversionException(
284 | f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
285 | )
286 |
287 | # Nothing can handle it!
288 | raise UnsupportedFormatException(
289 | f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
290 | )
291 |
292 | def _append_ext(self, extensions, ext):
293 | """Append a unique non-None, non-empty extension to a list of extensions."""
294 | if ext is None:
295 | return
296 | ext = ext.strip()
297 | if ext == "":
298 | return
299 |         if ext not in extensions:
300 |             extensions.append(ext)
301 |
302 | def _guess_ext_magic(self, path):
303 | """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
304 | # Use puremagic to guess
305 | try:
306 | guesses = puremagic.magic_file(path)
307 |
308 | # Fix for: https://github.com/microsoft/markitdown/issues/222
309 | # If there are no guesses, then try again after trimming leading ASCII whitespaces.
310 | # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
311 | # (space, tab, newline, carriage return, vertical tab, form feed).
312 | if len(guesses) == 0:
313 | with open(path, "rb") as file:
314 | while True:
315 | char = file.read(1)
316 | if not char: # End of file
317 | break
318 | if not char.isspace():
319 | file.seek(file.tell() - 1)
320 | break
321 | try:
322 | guesses = puremagic.magic_stream(file)
323 | except puremagic.main.PureError:
324 | pass
325 |
326 | extensions = list()
327 | for g in guesses:
328 | ext = g.extension.strip()
329 | if len(ext) > 0:
330 | if not ext.startswith("."):
331 | ext = "." + ext
332 | if ext not in extensions:
333 | extensions.append(ext)
334 | return extensions
335 | except FileNotFoundError:
336 | pass
337 | except IsADirectoryError:
338 | pass
339 | except PermissionError:
340 | pass
341 | return []
342 |
343 | def register_page_converter(self, converter: DocumentConverter) -> None:
344 | """Register a page text converter."""
345 | self._page_converters.insert(0, converter)
346 |
--------------------------------------------------------------------------------
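
For reference, a short sketch of driving the `MarkItDown` facade; the file names and URL are placeholders:

```python
from pathlib import Path

from core.markitdown import MarkItDown

md = MarkItDown(mode="simple")  # "advanced" or "cloud" swap in a different PDF converter

# Strings beginning with http(s):// or file:// are fetched over the session;
# anything else (including Path objects) is treated as a local file.
doc = md.convert(Path("report.docx"))
page = md.convert("https://example.com/article")

# When the name carries no useful extension, pass a hint; it is tried first,
# ahead of the extensions puremagic guesses from the file's leading bytes.
raw = md.convert("download.bin", file_extension=".html")
print(doc.title, len(page.text_content), len(raw.text_content))
```
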
/core/model_manager.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from pathlib import Path
4 |
5 | from huggingface_hub import snapshot_download as hf_download
6 | from modelscope.hub.snapshot_download import snapshot_download as ms_download
7 |
8 | DEFAULT_CONFIG_NAME = "magic-pdf.json"
9 | GITHUB_TEMPLATE_URL = "https://raw.githubusercontent.com/opendatalab/MinerU/master/magic-pdf.template.json"
10 | MODEL_REPOS = {
11 | 'main': 'opendatalab/PDF-Extract-Kit-1.0',
12 | 'layout': 'hantian/layoutreader'
13 | }
14 |
15 |
16 | class ModelConfigurator:
17 |     """Manages MinerU model downloads and configuration."""
18 |
19 | def __init__(self, device='cpu', models_dir=None, use_modelscope=True):
20 | self.device = device
21 | self.use_modelscope = use_modelscope
22 | self.models_dir = models_dir
23 | self.config_path = self._get_config_path()
24 | self.mineru_patterns = [
25 | "models/Layout/LayoutLMv3/*",
26 | "models/Layout/YOLO/*",
27 | "models/MFD/YOLO/*",
28 | "models/MFR/unimernet_small_2501/*",
29 | "models/TabRec/TableMaster/*",
30 | "models/TabRec/StructEqTable/*",
31 | ]
32 | if self.use_modelscope:
33 | MODEL_REPOS['layout'] = 'ppaanngggg/layoutreader'
34 |
35 | def _get_cache_dir(self, model_type):
36 |         """Return a cache directory matching each hub library's conventions."""
37 | if self.models_dir:
38 | custom_dir = Path(self.models_dir).expanduser().resolve()
39 | return custom_dir / model_type
40 |
41 |         # Fall back to each hub's default cache location
42 | if self.use_modelscope:
43 | return Path.home() / ".cache/modelscope/hub" / MODEL_REPOS[model_type]
44 | else:
45 | return Path.home() / ".cache/huggingface/hub" / MODEL_REPOS[model_type]
46 |
47 | def _get_config_path(self):
48 |         """Resolve the config file path (env override, else the home directory)."""
49 | env_path = os.getenv('MINERU_TOOLS_CONFIG_JSON')
50 | return Path(env_path) if env_path else Path.home() / DEFAULT_CONFIG_NAME
51 |
52 | def setup_environment(self):
53 |         """Download models, write the config, and export its path."""
54 | self._download_models()
55 | self._generate_config()
56 | os.environ['MINERU_TOOLS_CONFIG_JSON'] = str(self.config_path)
57 |
58 | def _download_models(self):
59 |         """Download model snapshots from ModelScope or Hugging Face."""
60 | downloader = ms_download if self.use_modelscope else hf_download
61 |
62 | model_paths = {}
63 | for model_type in ['main', 'layout']:
64 | cache_dir = self._get_cache_dir(model_type)
65 |
66 |             print(f"Downloading the {model_type} model to: {cache_dir}")
67 |
68 |             # Keep each library's default cache behavior; only override when --models-dir is given
69 |             download_args = {
70 |                 'repo_id': MODEL_REPOS[model_type],
71 |                 'local_dir': str(cache_dir),  # pin the files to the resolved location
72 |                 'allow_patterns': self.mineru_patterns if model_type == 'main' else None  # fetch only the needed subtrees
73 |             }
74 |
75 |             # Only override the cache directory for custom paths
76 | if self.models_dir:
77 | download_args['cache_dir'] = str(cache_dir.parent)
78 |
79 | snapshot_path = downloader(**download_args)
80 |
81 |             # Handle the repos' differing directory layouts
82 |             if model_type == 'main':
83 |                 self.main_model_path = Path(snapshot_path) / 'models'
84 |             else:
85 |                 self.layout_model_path = Path(snapshot_path)
86 |             model_paths[model_type] = snapshot_path
87 |         return model_paths
88 |
89 | def _generate_config(self):
90 |         """Generate the magic-pdf config file."""
91 | template_path = "assets/magic-pdf-template.json"
92 | try:
93 | with open(template_path, "r") as f:
94 | template_config = json.load(f)
95 |             print(f"Loaded template config: {template_path}")
96 | except Exception as e:
97 |             print(f"Failed to load template config, using defaults: {e}")
98 | template_config = {}
99 |
100 | custom_config = {
101 | "device-mode": self.device,
102 | "models-dir": str(self.main_model_path),
103 | "layoutreader-model-dir": str(self.layout_model_path),
104 | }
105 | template_config.update(custom_config)
106 | config = template_config
107 |
108 | if self.config_path.exists():
109 | with open(self.config_path, 'r') as f:
110 | existing_config = json.load(f)
111 | existing_config.update(custom_config)
112 | config = existing_config
113 |
114 | with open(self.config_path, 'w') as f:
115 | json.dump(config, f, indent=2)
116 |
--------------------------------------------------------------------------------
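
A sketch of driving `ModelConfigurator` directly, e.g. from a setup script; the device and directory values here are assumptions:

```python
import os

from core.model_manager import ModelConfigurator

configurator = ModelConfigurator(
    device="cuda",              # written to "device-mode" in the config
    models_dir="/data/models",  # optional; omit to use the default hub caches
    use_modelscope=False,       # False pulls from Hugging Face instead
)
configurator.setup_environment()  # downloads snapshots, writes magic-pdf.json

# MinerU locates the generated config through this environment variable.
print(os.environ["MINERU_TOOLS_CONFIG_JSON"])
```
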
/main.py:
--------------------------------------------------------------------------------
1 | import io
2 | import os
3 | import uuid
4 | from contextlib import asynccontextmanager
5 | from pathlib import Path
6 | from typing import Optional, List
7 |
8 | import openai
9 | from fastapi import FastAPI, UploadFile, File, HTTPException, status, Depends, BackgroundTasks, Form, Query
10 | from fastapi.responses import FileResponse
11 | from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
12 | from pydantic import BaseModel
13 | from sqlalchemy.orm import Session
14 | from fastapi.staticfiles import StaticFiles
15 |
16 | from core.markitdown import MarkItDown
17 | from core.base import DocumentConverterResult
18 | from core.model_manager import ModelConfigurator
19 | from repository.db import get_db, Job
20 |
21 | # Bearer-token security scheme
22 | security = HTTPBearer()
23 |
24 | # API key from the environment
25 | API_KEY = os.getenv("MARKIT_API_KEY", "secret-key")
26 | OUTPUT_DIR = Path("output")
27 | OUTPUT_DIR.mkdir(exist_ok=True)
28 | MINER_RUNNING_DEVICE = os.getenv("MINER_RUNNING_DEVICE", "cpu")
29 | port = int(os.getenv("PORT", 20926))
30 |
31 |
32 | # Dependency: API key verification (attach to routes via Depends(verify_api_key))
33 | async def verify_api_key(
34 | credentials: HTTPAuthorizationCredentials = Depends(security)
35 | ):
36 | if credentials.scheme != "Bearer" or credentials.credentials != API_KEY:
37 | raise HTTPException(
38 | status_code=status.HTTP_401_UNAUTHORIZED,
39 | detail="Invalid API Key",
40 | )
41 | return credentials
42 |
43 |
44 | @asynccontextmanager
45 | async def lifespan(app: FastAPI):
46 |     """Lifecycle management for service startup and shutdown."""
47 |     try:
48 |         # Initialize the models
49 | configurator = ModelConfigurator(
50 | device=os.getenv("MINERU_DEVICE", MINER_RUNNING_DEVICE),
51 | use_modelscope=os.getenv("MINERU_USE_MODELSCOPE", "true").lower() in ("true", "1")
52 | )
53 | configurator.setup_environment()
54 |         print("Model initialization complete")
55 |     except Exception as e:
56 |         print(f"Model initialization failed: {str(e)}")
57 | raise
58 |
59 |     yield  # application runs here
60 |
61 |     # Cleanup logic (optional)
62 |     print("Service shutting down, cleaning up resources...")
63 |
64 |
65 | # FastAPI application
66 | app = FastAPI(lifespan=lifespan)
67 | if not os.path.exists("output/images"):
68 | os.mkdir("output/images")
69 | app.mount("/images", StaticFiles(directory="output/images"), name="images")
70 |
71 |
72 | # from slowapi import Limiter, _rate_limit_exceeded_handler
73 | # from slowapi.errors import RateLimitExceeded
74 | # from slowapi.util import get_remote_address
75 | # limiter = Limiter(key_func=get_remote_address)
76 | # app.state.limiter = limiter
77 | # app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
78 | # @limiter.limit("100/minute")
79 |
80 |
81 | # Response models
82 | class JobStatusResponse(BaseModel):
83 | job_id: str
84 | status: str
85 | filename: str
86 | params: dict
87 | error: Optional[str]
88 |
89 |
90 | class JobResultResponse(BaseModel):
91 | job_id: str
92 | download_url: str
93 | format: str
94 |
95 |
96 | oai_client = None
97 | if os.getenv("MARKIFY_LLM_API_KEY", None) and os.getenv("MARKIFY_LLM_API_BASE", None):
98 | oai_client = openai.OpenAI(
99 | api_key=os.getenv("MARKIFY_LLM_API_KEY", None),
100 | base_url=os.getenv("MARKIFY_LLM_API_BASE", None)
101 | )
102 |
103 |
104 | def process_file(db: Session, job_id: str, file_content: bytes, filename: str, mode: str = "simple"):
105 |     """Background task that converts an uploaded file to Markdown."""
106 |     try:
107 |         # Mark the job as processing
108 | job = db.query(Job).filter(Job.id == job_id).first()
109 | if not job:
110 | raise ValueError(f"Job {job_id} not found")
111 |
112 | job.status = "processing"
113 | db.commit()
114 |
115 |         # Build the converter
116 | markitdown = MarkItDown(mode=mode,
117 | llm_client=oai_client,
118 | llm_model=os.getenv("MARKIFY_LLM_MODEL", None)
119 | )
120 |
121 |         # Dispatch on input type
122 | if filename.endswith('.md'):
123 | result = DocumentConverterResult(text_content=file_content.decode('utf-8'))
124 | else:
125 |             # Wrap the bytes in a stream; hint the original extension
126 |             file_stream = io.BytesIO(file_content)
127 |             result = markitdown.convert_stream(file_stream, file_extension=os.path.splitext(filename)[1] or None, base_url=f"http://localhost:{port}")
128 |
129 |         # Persist the result to a file
130 | output_file = OUTPUT_DIR / f"{job_id}.md"
131 | with open(output_file, "w", encoding="utf-8") as f:
132 | f.write(result.text_content)
133 |
134 |         # Mark the job as completed
135 | job.status = "completed"
136 | job.result_file = str(output_file)
137 | db.commit()
138 |     except Exception as e:
139 |         # Mark the job as failed; `job` is None when the job id was not found
140 |         if job is not None:
141 |             job.status = "failed"
142 |             job.error = f"{type(e).__name__}: {str(e)}"
143 |             db.commit()
144 |
145 |
146 | @app.post("/api/jobs", status_code=status.HTTP_202_ACCEPTED)
147 | async def upload_file(
148 | background_tasks: BackgroundTasks,
149 | file: UploadFile = File(...),
150 | mode: str = Form("simple"),
151 | db: Session = Depends(get_db)
152 | ):
153 |     """Upload a file and start a conversion job."""
154 |     # Generate a job ID
155 | job_id = str(uuid.uuid4())
156 |
157 | try:
158 |         # Read the file content
159 | content = await file.read()
160 |
161 |         # Create the job record
162 | job = Job(
163 | id=job_id,
164 | filename=file.filename,
165 | params={"mode": mode},
166 | status="pending"
167 | )
168 | db.add(job)
169 | db.commit()
170 |
171 |         # Launch the background task
172 | background_tasks.add_task(
173 | process_file,
174 | db=db,
175 | job_id=job_id,
176 | file_content=content,
177 | filename=file.filename,
178 | mode=mode
179 | )
180 |
181 | return {"job_id": job_id}
182 |
183 | except Exception as e:
184 | raise HTTPException(
185 | status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
186 | detail=f"File upload failed: {str(e)}"
187 | )
188 |
189 |
190 | @app.get("/api/jobs", response_model=List[JobStatusResponse])
191 | async def list_jobs(
192 | db: Session = Depends(get_db),
193 |         page: int = Query(0, ge=0, description="page number, starting at 0"),
194 |         limit: int = Query(10, gt=0, le=100, description="default 10, max 100")):
195 |     """List jobs, newest first."""
196 | jobs = db.query(Job).order_by(Job.created_at.desc()).limit(limit).offset(page * limit).all()
197 | if not jobs:
198 | raise HTTPException(
199 | status_code=status.HTTP_404_NOT_FOUND,
200 |             detail="No jobs found"
201 | )
202 |
203 | response_list = []
204 | for job in jobs:
205 | response_list.append(JobStatusResponse(
206 | job_id=job.id,
207 | status=job.status,
208 | filename=job.filename,
209 | params=job.params,
210 | error=job.error
211 | ))
212 | return response_list
213 |
214 |
215 | @app.get("/api/jobs/{job_id}", response_model=JobStatusResponse)
216 | async def get_job_status(
217 | job_id: str,
218 | db: Session = Depends(get_db)
219 | ):
220 |     """Get the status of a single job."""
221 | job = db.query(Job).filter(Job.id == job_id).first()
222 | if not job:
223 | raise HTTPException(
224 | status_code=status.HTTP_404_NOT_FOUND,
225 | detail="Job not found"
226 | )
227 |
228 | return JobStatusResponse(
229 | job_id=job.id,
230 | status=job.status,
231 | filename=job.filename,
232 | params=job.params,
233 | error=job.error
234 | )
235 |
236 |
237 | @app.get("/api/jobs/{job_id}/result")
238 | async def download_result(
239 | job_id: str,
240 | db: Session = Depends(get_db)
241 | ):
242 |     """Download the result file for a job."""
243 | job = db.query(Job).filter(Job.id == job_id).first()
244 | if not job:
245 | raise HTTPException(
246 | status_code=status.HTTP_404_NOT_FOUND,
247 | detail="Job not found"
248 | )
249 |
250 | if job.status != "completed":
251 | raise HTTPException(
252 | status_code=status.HTTP_425_TOO_EARLY,
253 | detail="Job not completed"
254 | )
255 |
256 | result_file = job.result_file
257 | if not result_file or not os.path.exists(result_file):
258 | raise HTTPException(
259 | status_code=status.HTTP_404_NOT_FOUND,
260 | detail="Result file not found"
261 | )
262 |
263 |     # Return the file content
264 | return FileResponse(
265 | result_file,
266 | filename=f"{job.filename}.md",
267 | media_type="text/markdown"
268 | )
269 |
270 |
271 | if __name__ == "__main__":
272 | import uvicorn
273 |
274 | uvicorn.run(app, host="0.0.0.0", port=port)
275 |
--------------------------------------------------------------------------------
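
Client-side, the job flow is submit, poll, download. A sketch with `requests`, where the base URL and key reflect the defaults above and `paper.pdf` is a placeholder; note that `verify_api_key` is defined but not yet attached to these routes, so the header only matters once it is wired in via `Depends`:

```python
import time

import requests

BASE = "http://localhost:20926"
HEADERS = {"Authorization": "Bearer secret-key"}  # MARKIT_API_KEY default

# Submit a conversion job: multipart file upload plus the "mode" form field.
with open("paper.pdf", "rb") as f:
    resp = requests.post(f"{BASE}/api/jobs", headers=HEADERS,
                         files={"file": f}, data={"mode": "advanced"})
job_id = resp.json()["job_id"]

# Poll the status endpoint until the background task settles.
while True:
    job = requests.get(f"{BASE}/api/jobs/{job_id}", headers=HEADERS).json()
    if job["status"] in ("completed", "failed"):
        break
    time.sleep(2)

# Fetch the Markdown result (the server returns 425 Too Early while running).
if job["status"] == "completed":
    markdown = requests.get(f"{BASE}/api/jobs/{job_id}/result", headers=HEADERS).text
    print(markdown[:200])
```
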
/repository/db.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy import create_engine, Column, String, Integer, JSON, DateTime
2 | from sqlalchemy.orm import declarative_base
3 | from sqlalchemy.orm import sessionmaker
4 | from datetime import datetime
5 |
6 | # SQLite database path
7 | DATABASE_URL = "sqlite:///./jobs.db"
8 |
9 | # Create the database engine
10 | engine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False})
11 |
12 | # Session factory
13 | SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
14 |
15 | # Declarative base class
16 | Base = declarative_base()
17 |
18 |
19 | # Job model
20 | class Job(Base):
21 | __tablename__ = "jobs"
22 |
23 | id = Column(String, primary_key=True, index=True)
24 | status = Column(String, default="pending")
25 | filename = Column(String)
26 | params = Column(JSON)
27 | result_file = Column(String)
28 | error = Column(String)
29 | created_at = Column(DateTime, default=datetime.utcnow)
30 | updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
31 |
32 |
33 | # Create the database tables
34 | Base.metadata.create_all(bind=engine)
35 |
36 |
37 | # Yield a database session (FastAPI dependency)
38 | def get_db():
39 | db = SessionLocal()
40 | try:
41 | yield db
42 | finally:
43 | db.close()
44 |
--------------------------------------------------------------------------------
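
Outside a request, `get_db` cannot be used as a dependency, but the same `SessionLocal` factory works directly; a minimal sketch:

```python
from repository.db import Job, SessionLocal

# Open a session, count pending jobs, and always release the connection.
db = SessionLocal()
try:
    pending = db.query(Job).filter(Job.status == "pending").count()
    print(f"pending jobs: {pending}")
finally:
    db.close()
```
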
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Core dependencies
2 | beautifulsoup4~=4.12.3
3 | requests~=2.32.3
4 | mammoth~=1.9.0
5 | markdownify~=0.14.1
6 | numpy
7 | python-pptx==1.0.2
8 | pandas~=2.2.3
9 | openpyxl==3.1.5
10 | xlrd==2.0.1
11 | puremagic~=1.28
12 | pydub~=0.25.1
13 | olefile~=0.47
14 | youtube-transcript-api==0.6.3
15 | SpeechRecognition==3.14.0
16 | pathvalidate==3.2.3
17 | charset-normalizer==3.4.1
18 | openai~=1.59.7
19 | magic-pdf[full] --extra-index-url https://wheels.myhloli.com
20 | modelscope~=1.22.2
21 | huggingface_hub~=0.27.1
22 | slowapi~=0.1.9
23 | limits~=4.0.1
24 | python-multipart~=0.0.20
25 | uvicorn>=0.34.0
26 | sqlalchemy>=2.0.37
27 | fastapi~=0.115.7
28 | pydantic~=2.10.5
29 | setuptools~=75.1.0
30 | streamlit~=1.29.0
31 |
32 | # Development dependencies (optional)
33 | mypy>=1.0.0
--------------------------------------------------------------------------------