├── .github
    └── workflows
    │   └── docker-build.yml
├── .gitignore
├── Dockerfile
├── LICENSE
├── NOTICE
├── README.md
├── assets
    ├── magic-pdf-template.json
    ├── sponsor.JPG
    ├── streamlint_ui.png
    └── zsxq.JPG
├── client
    └── streamlit_client.py
├── core
    ├── __init__.py
    ├── base.py
    ├── converters
    │   ├── __init__.py
    │   ├── bingsearch.py
    │   ├── custommarkdownify.py
    │   ├── docx.py
    │   ├── html.py
    │   ├── image.py
    │   ├── ipynb.py
    │   ├── media.py
    │   ├── mineru
    │   │   ├── __init__.py
    │   │   ├── pdf_processor.py
    │   │   └── title_corrector.py
    │   ├── mp3.py
    │   ├── outlook.py
    │   ├── pdf.py
    │   ├── plaintext.py
    │   ├── pptx.py
    │   ├── rss.py
    │   ├── wav.py
    │   ├── wikipedia.py
    │   ├── xls.py
    │   ├── xlsx.py
    │   ├── youtube.py
    │   └── zip.py
    ├── markitdown.py
    └── model_manager.py
├── main.py
├── repository
    └── db.py
└── requirements.txt


/.github/workflows/docker-build.yml:
--------------------------------------------------------------------------------
 1 | name: Docker Image CI
 2 | 
 3 | on:
 4 |   push:
 5 |     tags:
 6 |       - 'v[0-9]+.[0-9]+.[0-9]+'
 7 | 
 8 | jobs:
 9 |   build:
10 | 
11 |     runs-on: ubuntu-latest
12 | 
13 |     env:
14 |       IMAGE_NAME: ${{ github.event.repository.name }}
15 | 
16 |     steps:
17 |     - uses: actions/checkout@v4
18 | 
19 |     - name: Remove 'v' prefix from tag
20 |       id: tag_name
21 |       run: |
22 |         TAG_NAME=${GITHUB_REF#refs/tags/}
23 |         TAG_NAME=${TAG_NAME#v}
24 |         echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV
25 | 
26 |     - name: Log in to DockerHub
27 |       run: echo "${{ secrets.DOCKERHUB_TOKEN }}" | docker login -u "${{ secrets.DOCKERHUB_USERNAME }}" --password-stdin
28 | 
29 |     - name: Build the Docker image
30 |       run: docker build . --file Dockerfile --tag ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ env.TAG_NAME }}
31 | 
32 |     - name: Push the Docker image
33 |       run: docker push ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ env.TAG_NAME }}
34 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # UV
 98 | #   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #uv.lock
102 | 
103 | # poetry
104 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
106 | #   commonly ignored for libraries.
107 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108 | #poetry.lock
109 | 
110 | # pdm
111 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112 | #pdm.lock
113 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114 | #   in version control.
115 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116 | .pdm.toml
117 | .pdm-python
118 | .pdm-build/
119 | 
120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121 | __pypackages__/
122 | 
123 | # Celery stuff
124 | celerybeat-schedule
125 | celerybeat.pid
126 | 
127 | # SageMath parsed files
128 | *.sage.py
129 | 
130 | # Environments
131 | .env
132 | .venv
133 | env/
134 | venv/
135 | ENV/
136 | env.bak/
137 | venv.bak/
138 | 
139 | # Spyder project settings
140 | .spyderproject
141 | .spyproject
142 | 
143 | # Rope project settings
144 | .ropeproject
145 | 
146 | # mkdocs documentation
147 | /site
148 | 
149 | # mypy
150 | .mypy_cache/
151 | .dmypy.json
152 | dmypy.json
153 | 
154 | # Pyre type checker
155 | .pyre/
156 | 
157 | # pytype static type analyzer
158 | .pytype/
159 | 
160 | # Cython debug symbols
161 | cython_debug/
162 | 
163 | # PyCharm
164 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
167 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
168 | #.idea/
169 | 
170 | # Ruff stuff:
171 | .ruff_cache/
172 | 
173 | # PyPI configuration file
174 | .pypirc
175 | 
176 | # custome
177 | *.db
178 | output_files
179 | .DS_Store
180 | .idea
181 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.10-slim
 2 | 
 3 | ENV PYTHONDONTWRITEBYTECODE=1
 4 | 
 5 | ENV PYTHONUNBUFFERED=1
 6 | 
 7 | ENV HF_ENDPOINT="https://hf-mirror.com"
 8 | 
 9 | # 设置工作目录
10 | WORKDIR /app
11 | 
12 | # 将 requirements.txt 复制到容器中
13 | COPY requirements.txt .
14 | 
15 | # 安装依赖
16 | RUN pip install --upgrade -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple pip && \
17 |     pip install --no-cache-dir -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple -r requirements.txt
18 | 
19 | # 将应用代码复制到容器中
20 | COPY . .
21 | 
22 | # 暴露应用服务端口
23 | EXPOSE 20926
24 | 
25 | # 定义启动命令
26 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "20926"]
27 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | This project is licensed under the GNU Affero General Public License v3.0 (AGPL-3.0).
  2 | Portions of this software are derived from the following project, see NOTICE for details.
  3 | - [MarkitDown] (https://github.com/microsoft/markitdown) under the MIT License.
  4 | - [MinerU] (https://github.com/opendatalab/MinerU) under the AGPL License.
  5 | 
  6 | 
  7 | 
  8 |                     GNU AFFERO GENERAL PUBLIC LICENSE
  9 |                        Version 3, 19 November 2007
 10 | 
 11 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
 12 |  Everyone is permitted to copy and distribute verbatim copies
 13 |  of this license document, but changing it is not allowed.
 14 | 
 15 |                             Preamble
 16 | 
 17 |   The GNU Affero General Public License is a free, copyleft license for
 18 | software and other kinds of works, specifically designed to ensure
 19 | cooperation with the community in the case of network server software.
 20 | 
 21 |   The licenses for most software and other practical works are designed
 22 | to take away your freedom to share and change the works.  By contrast,
 23 | our General Public Licenses are intended to guarantee your freedom to
 24 | share and change all versions of a program--to make sure it remains free
 25 | software for all its users.
 26 | 
 27 |   When we speak of free software, we are referring to freedom, not
 28 | price.  Our General Public Licenses are designed to make sure that you
 29 | have the freedom to distribute copies of free software (and charge for
 30 | them if you wish), that you receive source code or can get it if you
 31 | want it, that you can change the software or use pieces of it in new
 32 | free programs, and that you know you can do these things.
 33 | 
 34 |   Developers that use our General Public Licenses protect your rights
 35 | with two steps: (1) assert copyright on the software, and (2) offer
 36 | you this License which gives you legal permission to copy, distribute
 37 | and/or modify the software.
 38 | 
 39 |   A secondary benefit of defending all users' freedom is that
 40 | improvements made in alternate versions of the program, if they
 41 | receive widespread use, become available for other developers to
 42 | incorporate.  Many developers of free software are heartened and
 43 | encouraged by the resulting cooperation.  However, in the case of
 44 | software used on network servers, this result may fail to come about.
 45 | The GNU General Public License permits making a modified version and
 46 | letting the public access it on a server without ever releasing its
 47 | source code to the public.
 48 | 
 49 |   The GNU Affero General Public License is designed specifically to
 50 | ensure that, in such cases, the modified source code becomes available
 51 | to the community.  It requires the operator of a network server to
 52 | provide the source code of the modified version running there to the
 53 | users of that server.  Therefore, public use of a modified version, on
 54 | a publicly accessible server, gives the public access to the source
 55 | code of the modified version.
 56 | 
 57 |   An older license, called the Affero General Public License and
 58 | published by Affero, was designed to accomplish similar goals.  This is
 59 | a different license, not a version of the Affero GPL, but Affero has
 60 | released a new version of the Affero GPL which permits relicensing under
 61 | this license.
 62 | 
 63 |   The precise terms and conditions for copying, distribution and
 64 | modification follow.
 65 | 
 66 |                        TERMS AND CONDITIONS
 67 | 
 68 |   0. Definitions.
 69 | 
 70 |   "This License" refers to version 3 of the GNU Affero General Public License.
 71 | 
 72 |   "Copyright" also means copyright-like laws that apply to other kinds of
 73 | works, such as semiconductor masks.
 74 | 
 75 |   "The Program" refers to any copyrightable work licensed under this
 76 | License.  Each licensee is addressed as "you".  "Licensees" and
 77 | "recipients" may be individuals or organizations.
 78 | 
 79 |   To "modify" a work means to copy from or adapt all or part of the work
 80 | in a fashion requiring copyright permission, other than the making of an
 81 | exact copy.  The resulting work is called a "modified version" of the
 82 | earlier work or a work "based on" the earlier work.
 83 | 
 84 |   A "covered work" means either the unmodified Program or a work based
 85 | on the Program.
 86 | 
 87 |   To "propagate" a work means to do anything with it that, without
 88 | permission, would make you directly or secondarily liable for
 89 | infringement under applicable copyright law, except executing it on a
 90 | computer or modifying a private copy.  Propagation includes copying,
 91 | distribution (with or without modification), making available to the
 92 | public, and in some countries other activities as well.
 93 | 
 94 |   To "convey" a work means any kind of propagation that enables other
 95 | parties to make or receive copies.  Mere interaction with a user through
 96 | a computer network, with no transfer of a copy, is not conveying.
 97 | 
 98 |   An interactive user interface displays "Appropriate Legal Notices"
 99 | to the extent that it includes a convenient and prominently visible
100 | feature that (1) displays an appropriate copyright notice, and (2)
101 | tells the user that there is no warranty for the work (except to the
102 | extent that warranties are provided), that licensees may convey the
103 | work under this License, and how to view a copy of this License.  If
104 | the interface presents a list of user commands or options, such as a
105 | menu, a prominent item in the list meets this criterion.
106 | 
107 |   1. Source Code.
108 | 
109 |   The "source code" for a work means the preferred form of the work
110 | for making modifications to it.  "Object code" means any non-source
111 | form of a work.
112 | 
113 |   A "Standard Interface" means an interface that either is an official
114 | standard defined by a recognized standards body, or, in the case of
115 | interfaces specified for a particular programming language, one that
116 | is widely used among developers working in that language.
117 | 
118 |   The "System Libraries" of an executable work include anything, other
119 | than the work as a whole, that (a) is included in the normal form of
120 | packaging a Major Component, but which is not part of that Major
121 | Component, and (b) serves only to enable use of the work with that
122 | Major Component, or to implement a Standard Interface for which an
123 | implementation is available to the public in source code form.  A
124 | "Major Component", in this context, means a major essential component
125 | (kernel, window system, and so on) of the specific operating system
126 | (if any) on which the executable work runs, or a compiler used to
127 | produce the work, or an object code interpreter used to run it.
128 | 
129 |   The "Corresponding Source" for a work in object code form means all
130 | the source code needed to generate, install, and (for an executable
131 | work) run the object code and to modify the work, including scripts to
132 | control those activities.  However, it does not include the work's
133 | System Libraries, or general-purpose tools or generally available free
134 | programs which are used unmodified in performing those activities but
135 | which are not part of the work.  For example, Corresponding Source
136 | includes interface definition files associated with source files for
137 | the work, and the source code for shared libraries and dynamically
138 | linked subprograms that the work is specifically designed to require,
139 | such as by intimate data communication or control flow between those
140 | subprograms and other parts of the work.
141 | 
142 |   The Corresponding Source need not include anything that users
143 | can regenerate automatically from other parts of the Corresponding
144 | Source.
145 | 
146 |   The Corresponding Source for a work in source code form is that
147 | same work.
148 | 
149 |   2. Basic Permissions.
150 | 
151 |   All rights granted under this License are granted for the term of
152 | copyright on the Program, and are irrevocable provided the stated
153 | conditions are met.  This License explicitly affirms your unlimited
154 | permission to run the unmodified Program.  The output from running a
155 | covered work is covered by this License only if the output, given its
156 | content, constitutes a covered work.  This License acknowledges your
157 | rights of fair use or other equivalent, as provided by copyright law.
158 | 
159 |   You may make, run and propagate covered works that you do not
160 | convey, without conditions so long as your license otherwise remains
161 | in force.  You may convey covered works to others for the sole purpose
162 | of having them make modifications exclusively for you, or provide you
163 | with facilities for running those works, provided that you comply with
164 | the terms of this License in conveying all material for which you do
165 | not control copyright.  Those thus making or running the covered works
166 | for you must do so exclusively on your behalf, under your direction
167 | and control, on terms that prohibit them from making any copies of
168 | your copyrighted material outside their relationship with you.
169 | 
170 |   Conveying under any other circumstances is permitted solely under
171 | the conditions stated below.  Sublicensing is not allowed; section 10
172 | makes it unnecessary.
173 | 
174 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
175 | 
176 |   No covered work shall be deemed part of an effective technological
177 | measure under any applicable law fulfilling obligations under article
178 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
179 | similar laws prohibiting or restricting circumvention of such
180 | measures.
181 | 
182 |   When you convey a covered work, you waive any legal power to forbid
183 | circumvention of technological measures to the extent such circumvention
184 | is effected by exercising rights under this License with respect to
185 | the covered work, and you disclaim any intention to limit operation or
186 | modification of the work as a means of enforcing, against the work's
187 | users, your or third parties' legal rights to forbid circumvention of
188 | technological measures.
189 | 
190 |   4. Conveying Verbatim Copies.
191 | 
192 |   You may convey verbatim copies of the Program's source code as you
193 | receive it, in any medium, provided that you conspicuously and
194 | appropriately publish on each copy an appropriate copyright notice;
195 | keep intact all notices stating that this License and any
196 | non-permissive terms added in accord with section 7 apply to the code;
197 | keep intact all notices of the absence of any warranty; and give all
198 | recipients a copy of this License along with the Program.
199 | 
200 |   You may charge any price or no price for each copy that you convey,
201 | and you may offer support or warranty protection for a fee.
202 | 
203 |   5. Conveying Modified Source Versions.
204 | 
205 |   You may convey a work based on the Program, or the modifications to
206 | produce it from the Program, in the form of source code under the
207 | terms of section 4, provided that you also meet all of these conditions:
208 | 
209 |     a) The work must carry prominent notices stating that you modified
210 |     it, and giving a relevant date.
211 | 
212 |     b) The work must carry prominent notices stating that it is
213 |     released under this License and any conditions added under section
214 |     7.  This requirement modifies the requirement in section 4 to
215 |     "keep intact all notices".
216 | 
217 |     c) You must license the entire work, as a whole, under this
218 |     License to anyone who comes into possession of a copy.  This
219 |     License will therefore apply, along with any applicable section 7
220 |     additional terms, to the whole of the work, and all its parts,
221 |     regardless of how they are packaged.  This License gives no
222 |     permission to license the work in any other way, but it does not
223 |     invalidate such permission if you have separately received it.
224 | 
225 |     d) If the work has interactive user interfaces, each must display
226 |     Appropriate Legal Notices; however, if the Program has interactive
227 |     interfaces that do not display Appropriate Legal Notices, your
228 |     work need not make them do so.
229 | 
230 |   A compilation of a covered work with other separate and independent
231 | works, which are not by their nature extensions of the covered work,
232 | and which are not combined with it such as to form a larger program,
233 | in or on a volume of a storage or distribution medium, is called an
234 | "aggregate" if the compilation and its resulting copyright are not
235 | used to limit the access or legal rights of the compilation's users
236 | beyond what the individual works permit.  Inclusion of a covered work
237 | in an aggregate does not cause this License to apply to the other
238 | parts of the aggregate.
239 | 
240 |   6. Conveying Non-Source Forms.
241 | 
242 |   You may convey a covered work in object code form under the terms
243 | of sections 4 and 5, provided that you also convey the
244 | machine-readable Corresponding Source under the terms of this License,
245 | in one of these ways:
246 | 
247 |     a) Convey the object code in, or embodied in, a physical product
248 |     (including a physical distribution medium), accompanied by the
249 |     Corresponding Source fixed on a durable physical medium
250 |     customarily used for software interchange.
251 | 
252 |     b) Convey the object code in, or embodied in, a physical product
253 |     (including a physical distribution medium), accompanied by a
254 |     written offer, valid for at least three years and valid for as
255 |     long as you offer spare parts or customer support for that product
256 |     model, to give anyone who possesses the object code either (1) a
257 |     copy of the Corresponding Source for all the software in the
258 |     product that is covered by this License, on a durable physical
259 |     medium customarily used for software interchange, for a price no
260 |     more than your reasonable cost of physically performing this
261 |     conveying of source, or (2) access to copy the
262 |     Corresponding Source from a network server at no charge.
263 | 
264 |     c) Convey individual copies of the object code with a copy of the
265 |     written offer to provide the Corresponding Source.  This
266 |     alternative is allowed only occasionally and noncommercially, and
267 |     only if you received the object code with such an offer, in accord
268 |     with subsection 6b.
269 | 
270 |     d) Convey the object code by offering access from a designated
271 |     place (gratis or for a charge), and offer equivalent access to the
272 |     Corresponding Source in the same way through the same place at no
273 |     further charge.  You need not require recipients to copy the
274 |     Corresponding Source along with the object code.  If the place to
275 |     copy the object code is a network server, the Corresponding Source
276 |     may be on a different server (operated by you or a third party)
277 |     that supports equivalent copying facilities, provided you maintain
278 |     clear directions next to the object code saying where to find the
279 |     Corresponding Source.  Regardless of what server hosts the
280 |     Corresponding Source, you remain obligated to ensure that it is
281 |     available for as long as needed to satisfy these requirements.
282 | 
283 |     e) Convey the object code using peer-to-peer transmission, provided
284 |     you inform other peers where the object code and Corresponding
285 |     Source of the work are being offered to the general public at no
286 |     charge under subsection 6d.
287 | 
288 |   A separable portion of the object code, whose source code is excluded
289 | from the Corresponding Source as a System Library, need not be
290 | included in conveying the object code work.
291 | 
292 |   A "User Product" is either (1) a "consumer product", which means any
293 | tangible personal property which is normally used for personal, family,
294 | or household purposes, or (2) anything designed or sold for incorporation
295 | into a dwelling.  In determining whether a product is a consumer product,
296 | doubtful cases shall be resolved in favor of coverage.  For a particular
297 | product received by a particular user, "normally used" refers to a
298 | typical or common use of that class of product, regardless of the status
299 | of the particular user or of the way in which the particular user
300 | actually uses, or expects or is expected to use, the product.  A product
301 | is a consumer product regardless of whether the product has substantial
302 | commercial, industrial or non-consumer uses, unless such uses represent
303 | the only significant mode of use of the product.
304 | 
305 |   "Installation Information" for a User Product means any methods,
306 | procedures, authorization keys, or other information required to install
307 | and execute modified versions of a covered work in that User Product from
308 | a modified version of its Corresponding Source.  The information must
309 | suffice to ensure that the continued functioning of the modified object
310 | code is in no case prevented or interfered with solely because
311 | modification has been made.
312 | 
313 |   If you convey an object code work under this section in, or with, or
314 | specifically for use in, a User Product, and the conveying occurs as
315 | part of a transaction in which the right of possession and use of the
316 | User Product is transferred to the recipient in perpetuity or for a
317 | fixed term (regardless of how the transaction is characterized), the
318 | Corresponding Source conveyed under this section must be accompanied
319 | by the Installation Information.  But this requirement does not apply
320 | if neither you nor any third party retains the ability to install
321 | modified object code on the User Product (for example, the work has
322 | been installed in ROM).
323 | 
324 |   The requirement to provide Installation Information does not include a
325 | requirement to continue to provide support service, warranty, or updates
326 | for a work that has been modified or installed by the recipient, or for
327 | the User Product in which it has been modified or installed.  Access to a
328 | network may be denied when the modification itself materially and
329 | adversely affects the operation of the network or violates the rules and
330 | protocols for communication across the network.
331 | 
332 |   Corresponding Source conveyed, and Installation Information provided,
333 | in accord with this section must be in a format that is publicly
334 | documented (and with an implementation available to the public in
335 | source code form), and must require no special password or key for
336 | unpacking, reading or copying.
337 | 
338 |   7. Additional Terms.
339 | 
340 |   "Additional permissions" are terms that supplement the terms of this
341 | License by making exceptions from one or more of its conditions.
342 | Additional permissions that are applicable to the entire Program shall
343 | be treated as though they were included in this License, to the extent
344 | that they are valid under applicable law.  If additional permissions
345 | apply only to part of the Program, that part may be used separately
346 | under those permissions, but the entire Program remains governed by
347 | this License without regard to the additional permissions.
348 | 
349 |   When you convey a copy of a covered work, you may at your option
350 | remove any additional permissions from that copy, or from any part of
351 | it.  (Additional permissions may be written to require their own
352 | removal in certain cases when you modify the work.)  You may place
353 | additional permissions on material, added by you to a covered work,
354 | for which you have or can give appropriate copyright permission.
355 | 
356 |   Notwithstanding any other provision of this License, for material you
357 | add to a covered work, you may (if authorized by the copyright holders of
358 | that material) supplement the terms of this License with terms:
359 | 
360 |     a) Disclaiming warranty or limiting liability differently from the
361 |     terms of sections 15 and 16 of this License; or
362 | 
363 |     b) Requiring preservation of specified reasonable legal notices or
364 |     author attributions in that material or in the Appropriate Legal
365 |     Notices displayed by works containing it; or
366 | 
367 |     c) Prohibiting misrepresentation of the origin of that material, or
368 |     requiring that modified versions of such material be marked in
369 |     reasonable ways as different from the original version; or
370 | 
371 |     d) Limiting the use for publicity purposes of names of licensors or
372 |     authors of the material; or
373 | 
374 |     e) Declining to grant rights under trademark law for use of some
375 |     trade names, trademarks, or service marks; or
376 | 
377 |     f) Requiring indemnification of licensors and authors of that
378 |     material by anyone who conveys the material (or modified versions of
379 |     it) with contractual assumptions of liability to the recipient, for
380 |     any liability that these contractual assumptions directly impose on
381 |     those licensors and authors.
382 | 
383 |   All other non-permissive additional terms are considered "further
384 | restrictions" within the meaning of section 10.  If the Program as you
385 | received it, or any part of it, contains a notice stating that it is
386 | governed by this License along with a term that is a further
387 | restriction, you may remove that term.  If a license document contains
388 | a further restriction but permits relicensing or conveying under this
389 | License, you may add to a covered work material governed by the terms
390 | of that license document, provided that the further restriction does
391 | not survive such relicensing or conveying.
392 | 
393 |   If you add terms to a covered work in accord with this section, you
394 | must place, in the relevant source files, a statement of the
395 | additional terms that apply to those files, or a notice indicating
396 | where to find the applicable terms.
397 | 
398 |   Additional terms, permissive or non-permissive, may be stated in the
399 | form of a separately written license, or stated as exceptions;
400 | the above requirements apply either way.
401 | 
402 |   8. Termination.
403 | 
404 |   You may not propagate or modify a covered work except as expressly
405 | provided under this License.  Any attempt otherwise to propagate or
406 | modify it is void, and will automatically terminate your rights under
407 | this License (including any patent licenses granted under the third
408 | paragraph of section 11).
409 | 
410 |   However, if you cease all violation of this License, then your
411 | license from a particular copyright holder is reinstated (a)
412 | provisionally, unless and until the copyright holder explicitly and
413 | finally terminates your license, and (b) permanently, if the copyright
414 | holder fails to notify you of the violation by some reasonable means
415 | prior to 60 days after the cessation.
416 | 
417 |   Moreover, your license from a particular copyright holder is
418 | reinstated permanently if the copyright holder notifies you of the
419 | violation by some reasonable means, this is the first time you have
420 | received notice of violation of this License (for any work) from that
421 | copyright holder, and you cure the violation prior to 30 days after
422 | your receipt of the notice.
423 | 
424 |   Termination of your rights under this section does not terminate the
425 | licenses of parties who have received copies or rights from you under
426 | this License.  If your rights have been terminated and not permanently
427 | reinstated, you do not qualify to receive new licenses for the same
428 | material under section 10.
429 | 
430 |   9. Acceptance Not Required for Having Copies.
431 | 
432 |   You are not required to accept this License in order to receive or
433 | run a copy of the Program.  Ancillary propagation of a covered work
434 | occurring solely as a consequence of using peer-to-peer transmission
435 | to receive a copy likewise does not require acceptance.  However,
436 | nothing other than this License grants you permission to propagate or
437 | modify any covered work.  These actions infringe copyright if you do
438 | not accept this License.  Therefore, by modifying or propagating a
439 | covered work, you indicate your acceptance of this License to do so.
440 | 
441 |   10. Automatic Licensing of Downstream Recipients.
442 | 
443 |   Each time you convey a covered work, the recipient automatically
444 | receives a license from the original licensors, to run, modify and
445 | propagate that work, subject to this License.  You are not responsible
446 | for enforcing compliance by third parties with this License.
447 | 
448 |   An "entity transaction" is a transaction transferring control of an
449 | organization, or substantially all assets of one, or subdividing an
450 | organization, or merging organizations.  If propagation of a covered
451 | work results from an entity transaction, each party to that
452 | transaction who receives a copy of the work also receives whatever
453 | licenses to the work the party's predecessor in interest had or could
454 | give under the previous paragraph, plus a right to possession of the
455 | Corresponding Source of the work from the predecessor in interest, if
456 | the predecessor has it or can get it with reasonable efforts.
457 | 
458 |   You may not impose any further restrictions on the exercise of the
459 | rights granted or affirmed under this License.  For example, you may
460 | not impose a license fee, royalty, or other charge for exercise of
461 | rights granted under this License, and you may not initiate litigation
462 | (including a cross-claim or counterclaim in a lawsuit) alleging that
463 | any patent claim is infringed by making, using, selling, offering for
464 | sale, or importing the Program or any portion of it.
465 | 
466 |   11. Patents.
467 | 
468 |   A "contributor" is a copyright holder who authorizes use under this
469 | License of the Program or a work on which the Program is based.  The
470 | work thus licensed is called the contributor's "contributor version".
471 | 
472 |   A contributor's "essential patent claims" are all patent claims
473 | owned or controlled by the contributor, whether already acquired or
474 | hereafter acquired, that would be infringed by some manner, permitted
475 | by this License, of making, using, or selling its contributor version,
476 | but do not include claims that would be infringed only as a
477 | consequence of further modification of the contributor version.  For
478 | purposes of this definition, "control" includes the right to grant
479 | patent sublicenses in a manner consistent with the requirements of
480 | this License.
481 | 
482 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
483 | patent license under the contributor's essential patent claims, to
484 | make, use, sell, offer for sale, import and otherwise run, modify and
485 | propagate the contents of its contributor version.
486 | 
487 |   In the following three paragraphs, a "patent license" is any express
488 | agreement or commitment, however denominated, not to enforce a patent
489 | (such as an express permission to practice a patent or covenant not to
490 | sue for patent infringement).  To "grant" such a patent license to a
491 | party means to make such an agreement or commitment not to enforce a
492 | patent against the party.
493 | 
494 |   If you convey a covered work, knowingly relying on a patent license,
495 | and the Corresponding Source of the work is not available for anyone
496 | to copy, free of charge and under the terms of this License, through a
497 | publicly available network server or other readily accessible means,
498 | then you must either (1) cause the Corresponding Source to be so
499 | available, or (2) arrange to deprive yourself of the benefit of the
500 | patent license for this particular work, or (3) arrange, in a manner
501 | consistent with the requirements of this License, to extend the patent
502 | license to downstream recipients.  "Knowingly relying" means you have
503 | actual knowledge that, but for the patent license, your conveying the
504 | covered work in a country, or your recipient's use of the covered work
505 | in a country, would infringe one or more identifiable patents in that
506 | country that you have reason to believe are valid.
507 | 
508 |   If, pursuant to or in connection with a single transaction or
509 | arrangement, you convey, or propagate by procuring conveyance of, a
510 | covered work, and grant a patent license to some of the parties
511 | receiving the covered work authorizing them to use, propagate, modify
512 | or convey a specific copy of the covered work, then the patent license
513 | you grant is automatically extended to all recipients of the covered
514 | work and works based on it.
515 | 
516 |   A patent license is "discriminatory" if it does not include within
517 | the scope of its coverage, prohibits the exercise of, or is
518 | conditioned on the non-exercise of one or more of the rights that are
519 | specifically granted under this License.  You may not convey a covered
520 | work if you are a party to an arrangement with a third party that is
521 | in the business of distributing software, under which you make payment
522 | to the third party based on the extent of your activity of conveying
523 | the work, and under which the third party grants, to any of the
524 | parties who would receive the covered work from you, a discriminatory
525 | patent license (a) in connection with copies of the covered work
526 | conveyed by you (or copies made from those copies), or (b) primarily
527 | for and in connection with specific products or compilations that
528 | contain the covered work, unless you entered into that arrangement,
529 | or that patent license was granted, prior to 28 March 2007.
530 | 
531 |   Nothing in this License shall be construed as excluding or limiting
532 | any implied license or other defenses to infringement that may
533 | otherwise be available to you under applicable patent law.
534 | 
535 |   12. No Surrender of Others' Freedom.
536 | 
537 |   If conditions are imposed on you (whether by court order, agreement or
538 | otherwise) that contradict the conditions of this License, they do not
539 | excuse you from the conditions of this License.  If you cannot convey a
540 | covered work so as to satisfy simultaneously your obligations under this
541 | License and any other pertinent obligations, then as a consequence you may
542 | not convey it at all.  For example, if you agree to terms that obligate you
543 | to collect a royalty for further conveying from those to whom you convey
544 | the Program, the only way you could satisfy both those terms and this
545 | License would be to refrain entirely from conveying the Program.
546 | 
547 |   13. Remote Network Interaction; Use with the GNU General Public License.
548 | 
549 |   Notwithstanding any other provision of this License, if you modify the
550 | Program, your modified version must prominently offer all users
551 | interacting with it remotely through a computer network (if your version
552 | supports such interaction) an opportunity to receive the Corresponding
553 | Source of your version by providing access to the Corresponding Source
554 | from a network server at no charge, through some standard or customary
555 | means of facilitating copying of software.  This Corresponding Source
556 | shall include the Corresponding Source for any work covered by version 3
557 | of the GNU General Public License that is incorporated pursuant to the
558 | following paragraph.
559 | 
560 |   Notwithstanding any other provision of this License, you have
561 | permission to link or combine any covered work with a work licensed
562 | under version 3 of the GNU General Public License into a single
563 | combined work, and to convey the resulting work.  The terms of this
564 | License will continue to apply to the part which is the covered work,
565 | but the work with which it is combined will remain governed by version
566 | 3 of the GNU General Public License.
567 | 
568 |   14. Revised Versions of this License.
569 | 
570 |   The Free Software Foundation may publish revised and/or new versions of
571 | the GNU Affero General Public License from time to time.  Such new versions
572 | will be similar in spirit to the present version, but may differ in detail to
573 | address new problems or concerns.
574 | 
575 |   Each version is given a distinguishing version number.  If the
576 | Program specifies that a certain numbered version of the GNU Affero General
577 | Public License "or any later version" applies to it, you have the
578 | option of following the terms and conditions either of that numbered
579 | version or of any later version published by the Free Software
580 | Foundation.  If the Program does not specify a version number of the
581 | GNU Affero General Public License, you may choose any version ever published
582 | by the Free Software Foundation.
583 | 
584 |   If the Program specifies that a proxy can decide which future
585 | versions of the GNU Affero General Public License can be used, that proxy's
586 | public statement of acceptance of a version permanently authorizes you
587 | to choose that version for the Program.
588 | 
589 |   Later license versions may give you additional or different
590 | permissions.  However, no additional obligations are imposed on any
591 | author or copyright holder as a result of your choosing to follow a
592 | later version.
593 | 
594 |   15. Disclaimer of Warranty.
595 | 
596 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
597 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
598 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
599 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
600 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
601 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
602 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
603 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
604 | 
605 |   16. Limitation of Liability.
606 | 
607 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
608 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
609 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
610 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
611 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
612 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
613 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
614 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
615 | SUCH DAMAGES.
616 | 
617 |   17. Interpretation of Sections 15 and 16.
618 | 
619 |   If the disclaimer of warranty and limitation of liability provided
620 | above cannot be given local legal effect according to their terms,
621 | reviewing courts shall apply local law that most closely approximates
622 | an absolute waiver of all civil liability in connection with the
623 | Program, unless a warranty or assumption of liability accompanies a
624 | copy of the Program in return for a fee.
625 | 
626 |                      END OF TERMS AND CONDITIONS
627 | 
628 |             How to Apply These Terms to Your New Programs
629 | 
630 |   If you develop a new program, and you want it to be of the greatest
631 | possible use to the public, the best way to achieve this is to make it
632 | free software which everyone can redistribute and change under these terms.
633 | 
634 |   To do so, attach the following notices to the program.  It is safest
635 | to attach them to the start of each source file to most effectively
636 | state the exclusion of warranty; and each file should have at least
637 | the "copyright" line and a pointer to where the full notice is found.
638 | 
639 |     <one line to give the program's name and a brief idea of what it does.>
640 |     Copyright (C) <year>  <name of author>
641 | 
642 |     This program is free software: you can redistribute it and/or modify
643 |     it under the terms of the GNU Affero General Public License as published
644 |     by the Free Software Foundation, either version 3 of the License, or
645 |     (at your option) any later version.
646 | 
647 |     This program is distributed in the hope that it will be useful,
648 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
649 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
650 |     GNU Affero General Public License for more details.
651 | 
652 |     You should have received a copy of the GNU Affero General Public License
653 |     along with this program.  If not, see <https://www.gnu.org/licenses/>.
654 | 
655 | Also add information on how to contact you by electronic and paper mail.
656 | 
657 |   If your software can interact with users remotely through a computer
658 | network, you should also make sure that it provides a way for users to
659 | get its source.  For example, if your program is a web application, its
660 | interface could display a "Source" link that leads users to an archive
661 | of the code.  There are many ways you could offer source, and different
662 | solutions will be better for different programs; see section 13 for the
663 | specific requirements.
664 | 
665 |   You should also get your employer (if you work as a programmer) or school,
666 | if any, to sign a "copyright disclaimer" for the program, if necessary.
667 | For more information on this, and how to apply and follow the GNU AGPL, see
668 | <https://www.gnu.org/licenses/>.
669 | 


--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
 1 | 
 2 |     NOTICE
 3 | 
 4 |     This project includes code from the following MIT-licensed project(s):
 5 | 
 6 |     - Project Name: Markitdown
 7 |     - Repository: https://github.com/microsoft/markitdown
 8 |     - License: MIT
 9 | 
10 |     MIT License
11 | 
12 |     Copyright (c) Microsoft Corporation.
13 | 
14 |     Permission is hereby granted, free of charge, to any person obtaining a copy
15 |     of this software and associated documentation files (the "Software"), to deal
16 |     in the Software without restriction, including without limitation the rights
17 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
18 |     copies of the Software, and to permit persons to whom the Software is
19 |     furnished to do so, subject to the following conditions:
20 | 
21 |     The above copyright notice and this permission notice shall be included in all
22 |     copies or substantial portions of the Software.
23 | 
24 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 |     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29 |     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 |     SOFTWARE


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Markify
 2 | 
 3 | ✨ **轻松转换文件为 Markdown，助力 RAG 与 LLM 更智能地理解内容！** ✨  
 4 | 
 5 | 🚀 **基于 Markitdown 与 MinerU**，支持多种格式转换，并提供 **高质量 PDF 解析**，让你的文档更易处理、更易用！  
 6 | 
 7 | 📡 **支持 API & Streamlit 端**，随时随地高效转换，轻松集成！  
 8 | 
 9 | 📂 **支持多种文件格式**：
10 | - 📄 **文档**：PDF、Word、PPT、Excel  
11 | - 🖼 **多媒体**：图片、音频  
12 | - 🌐 **网页与数据**：HTML、CSV、JSON、XML  
13 | - 🗂 **压缩文件**：ZIP  
14 | 
15 | ⚡ **多种 PDF 解析模式，满足不同需求**：
16 | - 🚀 **快速模式**（基于 pdfminer，解析高效）  
17 | - 🏆 **高级模式**（结合 MinerU 深度解析，效果更佳）  
18 | - ☁️ **云端模式**（开发中，敬请期待！）  
19 | 
20 | 📖 **Markdown 化你的文件，助力 LLM 更好地理解与处理文档！** 💡
21 | 
22 | ![alt text](assets/streamlint_ui.png)
23 | ```shell
24 | streamlit run ./client/streamlit_client.py
25 | ```
26 | 
27 | ## API
28 | FastAPI自带API文档 http://127.0.0.1:20926/docs
29 | ### 上传文件，创建任务
30 | 请求
31 | ```shell
32 | curl -X 'POST' \
33 |   'http://127.0.0.1:20926/api/jobs' \
34 |   -H 'accept: application/json' \
35 |   -H 'Content-Type: multipart/form-data' \
36 |   -F 'file=@CoA.pdf;type=application/pdf' \
37 |   -F 'mode=advanced'
38 | ```
39 | 响应
40 | ```json
41 | {
42 |   "job_id": "29bbad6b-c167-41f0-8a29-99551c499263"
43 | }
44 | ```
45 | ### 查询任务状态
46 | 请求
47 | ```shell
48 | curl -X 'GET' \
49 |   'http://127.0.0.1:20926/api/jobs/29bbad6b-c167-41f0-8a29-99551c499263' \
50 |   -H 'accept: application/json'
51 | ```
52 | 响应
53 | ```json
54 | {
55 |   "job_id": "29bbad6b-c167-41f0-8a29-99551c499263",
56 |   "status": "completed",
57 |   "filename": "CoA.pdf",
58 |   "params": {
59 |     "mode": "advanced"
60 |   },
61 |   "error": null
62 | }
63 | ```
64 | ### 下载markdown文件
65 | 请求
66 | ```shell
67 | curl -X 'GET' \
68 |   'http://127.0.0.1:20926/api/jobs/29bbad6b-c167-41f0-8a29-99551c499263/result' \
69 |   -H 'accept: application/json'
70 | ```
71 | 响应
72 | 文件
73 | 
74 | 
75 | ## Docker部署
76 | ```shell
77 | docker pull wsjcuhk/markify:0.0.1
78 | docker run -d -p 20926:20926 wsjcuhk/markify:0.0.1
79 | ```
80 | 
81 | 
82 | ## TODO
83 | - 添加云端解析模式
84 | - 自动打包为Docker镜像
85 | 
86 | ## 赞助我
87 | 开源不易，有专业指导需求或赞助，可以加入我的知识星球，我会提供专业的技术指导。
88 | ![zsxq](assets/zsxq.JPG)
89 | 
90 | 
91 | ## 致敬
92 | 本项目参考微软markitdown和上海浦语mineru。
93 | - [markitdown](https://github.com/microsoft/markitdown)
94 | - [mineru](https://github.com/opendatalab/MinerU)
95 | 


--------------------------------------------------------------------------------
/assets/magic-pdf-template.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "bucket_info":{
 3 |         "bucket-name-1":["ak", "sk", "endpoint"],
 4 |         "bucket-name-2":["ak", "sk", "endpoint"]
 5 |     },
 6 |     "models-dir":"/tmp/models",
 7 |     "layoutreader-model-dir":"/tmp/layoutreader",
 8 |     "device-mode":"cpu",
 9 |     "layout-config": {
10 |         "model": "doclayout_yolo"
11 |     },
12 |     "formula-config": {
13 |         "mfd_model": "yolo_v8_mfd",
14 |         "mfr_model": "unimernet_small",
15 |         "enable": true
16 |     },
17 |     "table-config": {
18 |         "model": "rapid_table",
19 |         "sub_model": "slanet_plus",
20 |         "enable": true,
21 |         "max_time": 400
22 |     },
23 |     "llm-aided-config": {
24 |         "formula_aided": {
25 |             "api_key": "your_api_key",
26 |             "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
27 |             "model": "qwen2.5-7b-instruct",
28 |             "enable": false
29 |         },
30 |         "text_aided": {
31 |             "api_key": "your_api_key",
32 |             "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
33 |             "model": "qwen2.5-7b-instruct",
34 |             "enable": false
35 |         },
36 |         "title_aided": {
37 |             "api_key": "your_api_key",
38 |             "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
39 |             "model": "qwen2.5-32b-instruct",
40 |             "enable": false
41 |         }
42 |     },
43 |     "config_version": "1.1.1"
44 | }


--------------------------------------------------------------------------------
/assets/sponsor.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinMountain/markify/622070781f1cdb74506b3ec1ec7d1749c1ef458c/assets/sponsor.JPG


--------------------------------------------------------------------------------
/assets/streamlint_ui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinMountain/markify/622070781f1cdb74506b3ec1ec7d1749c1ef458c/assets/streamlint_ui.png


--------------------------------------------------------------------------------
/assets/zsxq.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinMountain/markify/622070781f1cdb74506b3ec1ec7d1749c1ef458c/assets/zsxq.JPG


--------------------------------------------------------------------------------
/client/streamlit_client.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import requests
  3 | import time
  4 | import os
  5 | 
  6 | # ============ 配置区 ============
  7 | BASE_URL = "http://localhost:20926"
  8 | 
  9 | 
 10 | # ============ 工具函数 ============
 11 | 
 12 | def fetch_jobs(page=0, limit=10):
 13 |     """
 14 |     从后端 /api/jobs 获取最新任务列表
 15 |     （请确保后端已实现 ?page=...&limit=... 分页参数）
 16 |     """
 17 |     url = f"{BASE_URL}/api/jobs?page={page}&limit={limit}"
 18 |     try:
 19 |         resp = requests.get(url)
 20 |         if resp.status_code == 200:
 21 |             return resp.json()  # 后端应返回一个任务列表 (list)
 22 |         else:
 23 |             st.error(f"获取任务列表失败: {resp.text}")
 24 |             return []
 25 |     except requests.RequestException as e:
 26 |         st.error(f"网络异常: {e}")
 27 |         return []
 28 | 
 29 | 
 30 | def upload_file(file, mode):
 31 |     """
 32 |     上传单个文件到后端，创建任务。
 33 |     成功后立刻刷新页面，以获取最新的任务列表。
 34 |     """
 35 |     files = {"file": file}
 36 |     data = {"mode": mode}
 37 |     try:
 38 |         response = requests.post(f"{BASE_URL}/api/jobs", files=files, data=data)
 39 |         if response.status_code == 202:
 40 |             st.success(f"文件 `{file.name}` 上传成功，已加入任务队列。")
 41 |             st.experimental_rerun()  # 触发页面刷新，从而 fetch_jobs()
 42 |         else:
 43 |             st.error(f"文件 `{file.name}` 上传失败: {response.text}")
 44 |     except requests.RequestException as e:
 45 |         st.error(f"网络异常：{e}")
 46 | 
 47 | 
 48 | def upload_url(url, mode):
 49 |     """
 50 |     上传单个 URL 到后端，创建任务。
 51 |     成功后立刻刷新页面，以获取最新的任务列表。
 52 |     """
 53 |     data = {"url": url, "mode": mode}
 54 |     try:
 55 |         response = requests.post(f"{BASE_URL}/api/jobs/url", json=data)
 56 |         if response.status_code == 202:
 57 |             st.success(f"URL `{url}` 提交成功，已加入任务队列。")
 58 |             st.experimental_rerun()
 59 |         else:
 60 |             st.error(f"URL `{url}` 上传失败: {response.text}")
 61 |     except requests.RequestException as e:
 62 |         st.error(f"网络异常：{e}")
 63 | 
 64 | 
 65 | def show_file_entry(job):
 66 |     """
 67 |     在右侧文件列表中渲染每个任务条目。
 68 |     后端返回的 job 数据结构示例（JSON）:
 69 |       {
 70 |         "job_id": "xxx",
 71 |         "status": "completed",
 72 |         "filename": "test.pdf",
 73 |         "params": {"mode": "simple"},
 74 |         "error": null,
 75 |         "created_at": "2025-02-25T10:00:00"
 76 |       }
 77 |     你也可以根据后端的返回字段进行修改。
 78 |     """
 79 |     col1, col2, col3, col4 = st.columns([3, 2, 2, 1])
 80 | 
 81 |     with col1:
 82 |         st.markdown(f"**{job['filename']}**")
 83 | 
 84 |     # 如果后端返回了 created_at，可显示
 85 |     with col2:
 86 |         created_time = job.get("created_at", "")
 87 |         st.markdown(f"{created_time}")
 88 | 
 89 |     with col3:
 90 |         status = job["status"]
 91 |         if status == "completed":
 92 |             status_icon = "✅"
 93 |         elif status == "failed":
 94 |             status_icon = "❌"
 95 |         else:
 96 |             status_icon = "⏳"
 97 |         st.markdown(f"{status_icon} {status}")
 98 | 
 99 |     with col4:
100 |         # 如果已完成，提供下载
101 |         if status == "completed":
102 |             try:
103 |                 result_response = requests.get(f"{BASE_URL}/api/jobs/{job['job_id']}/result")
104 |                 if result_response.status_code == 200:
105 |                     st.download_button(
106 |                         label="下载",
107 |                         data=result_response.content,
108 |                         file_name=f"{job['filename']}.md",
109 |                         mime="text/markdown",
110 |                         key=f"download_{job['job_id']}"  # 添加唯一 key
111 |                     )
112 |                 else:
113 |                     st.error("无法下载")
114 |             except requests.RequestException as e:
115 |                 st.error(f"下载异常：{e}")
116 | 
117 | 
118 | # ============ 主函数 ============
119 | 
120 | def main():
121 |     st.set_page_config(page_title="Markify", layout="wide")
122 | 
123 |     # 页面标题与说明
124 |     st.title("Markify - 文档处理")
125 |     st.markdown("在左侧上传文件或提交 URL，右侧实时查看进度并下载结果。")
126 | 
127 |     # 布局：左侧上传，右侧列表
128 |     left_col, right_col = st.columns([2, 3], gap="large")
129 | 
130 |     with left_col:
131 |         st.subheader("上传设置")
132 |         mode = st.selectbox("选择 PDF 处理模式", ["simple", "advanced", "cloud"])
133 | 
134 |         # 本地文件上传
135 |         uploaded_files = st.file_uploader(
136 |             "选择文件（任意类型）",
137 |             type=None,
138 |             accept_multiple_files=True
139 |         )
140 |         if uploaded_files and st.button("上传文件"):
141 |             for file in uploaded_files:
142 |                 upload_file(file, mode)
143 | 
144 |         # URL 上传
145 |         st.subheader("URL 上传")
146 |         file_urls = st.text_area("请输入文件 URL（每行一个）")
147 |         if file_urls and st.button("提交 URL"):
148 |             for url in file_urls.strip().split("\n"):
149 |                 if url:
150 |                     upload_url(url.strip(), mode)
151 | 
152 |         # 结果存储位置（仅作提示）
153 |         st.markdown(f"**解析结果存储路径**：`{os.path.expanduser('~')}/MinerU`")
154 | 
155 |     with right_col:
156 |         st.subheader("文件列表")
157 | 
158 |         # 手动刷新按钮
159 |         if st.button("刷新列表"):
160 |             st.experimental_rerun()
161 | 
162 |         # 从后端获取任务列表
163 |         jobs = fetch_jobs(page=0, limit=10)
164 |         if not jobs:
165 |             st.info("暂无任务，请上传后查看。")
166 |         else:
167 |             for job in jobs:
168 |                 show_file_entry(job)
169 | 
170 | 
171 | if __name__ == "__main__":
172 |     main()
173 | 


--------------------------------------------------------------------------------
/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinMountain/markify/622070781f1cdb74506b3ec1ec7d1749c1ef458c/core/__init__.py


--------------------------------------------------------------------------------
/core/base.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Union
 2 | 
 3 | 
 4 | class DocumentConverterResult:
 5 |     """The result of converting a document to text."""
 6 | 
 7 |     def __init__(self, title: Union[str, None] = None, text_content: str = ""):
 8 |         self.title: Union[str, None] = title
 9 |         self.text_content: str = text_content
10 | 
11 | 
12 | class DocumentConverter:
13 |     """Abstract superclass of all DocumentConverters."""
14 | 
15 |     def convert(
16 |             self, local_path: str, **kwargs: Any
17 |     ) -> Union[None, DocumentConverterResult]:
18 |         raise NotImplementedError()
19 | 
20 | 
21 | class FileConversionException(BaseException):
22 |     pass
23 | 
24 | 
25 | class UnsupportedFormatException(BaseException):
26 |     pass
27 | 


--------------------------------------------------------------------------------
/core/converters/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinMountain/markify/622070781f1cdb74506b3ec1ec7d1749c1ef458c/core/converters/__init__.py


--------------------------------------------------------------------------------
/core/converters/bingsearch.py:
--------------------------------------------------------------------------------
 1 | import base64
 2 | import binascii
 3 | import re
 4 | from typing import Union
 5 | from urllib.parse import parse_qs, urlparse
 6 | 
 7 | from bs4 import BeautifulSoup
 8 | 
 9 | from core.base import DocumentConverter, DocumentConverterResult
10 | from core.converters.custommarkdownify import _CustomMarkdownify
11 | 
12 | 
13 | class BingSerpConverter(DocumentConverter):
14 |     """
15 |     Handle Bing results pages (only the organic search results).
16 |     NOTE: It is better to use the Bing API
17 |     """
18 | 
19 |     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
20 |         # Bail if not a Bing SERP
21 |         extension = kwargs.get("file_extension", "")
22 |         if extension.lower() not in [".html", ".htm"]:
23 |             return None
24 |         url = kwargs.get("url", "")
25 |         if not re.search(r"^https://www\.bing\.com/search\?q=", url):
26 |             return None
27 | 
28 |         # Parse the query parameters
29 |         parsed_params = parse_qs(urlparse(url).query)
30 |         query = parsed_params.get("q", [""])[0]
31 | 
32 |         # Parse the file
33 |         soup = None
34 |         with open(local_path, "rt", encoding="utf-8") as fh:
35 |             soup = BeautifulSoup(fh.read(), "html.parser")
36 | 
37 |         # Clean up some formatting
38 |         for tptt in soup.find_all(class_="tptt"):
39 |             if hasattr(tptt, "string") and tptt.string:
40 |                 tptt.string += " "
41 |         for slug in soup.find_all(class_="algoSlug_icon"):
42 |             slug.extract()
43 | 
44 |         # Parse the algorithmic results
45 |         _markdownify = _CustomMarkdownify()
46 |         results = list()
47 |         for result in soup.find_all(class_="b_algo"):
48 |             # Rewrite redirect urls
49 |             for a in result.find_all("a", href=True):
50 |                 parsed_href = urlparse(a["href"])
51 |                 qs = parse_qs(parsed_href.query)
52 | 
53 |                 # The destination is contained in the u parameter,
54 |                 # but appears to be base64 encoded, with some prefix
55 |                 if "u" in qs:
56 |                     u = (
57 |                         qs["u"][0][2:].strip() + "=="
58 |                     )  # Python 3 doesn't care about extra padding
59 | 
60 |                     try:
61 |                         # RFC 4648 / Base64URL" variant, which uses "-" and "_"
62 |                         a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8")
63 |                     except UnicodeDecodeError:
64 |                         pass
65 |                     except binascii.Error:
66 |                         pass
67 | 
68 |             # Convert to markdown
69 |             md_result = _markdownify.convert_soup(result).strip()
70 |             lines = [line.strip() for line in re.split(r"\n+", md_result)]
71 |             results.append("\n".join([line for line in lines if len(line) > 0]))
72 | 
73 |         webpage_text = (
74 |             f"## A Bing search for '{query}' found the following results:\n\n"
75 |             + "\n\n".join(results)
76 |         )
77 | 
78 |         return DocumentConverterResult(
79 |             title=None if soup.title is None else soup.title.string,
80 |             text_content=webpage_text,
81 |         )
82 | 


--------------------------------------------------------------------------------
/core/converters/custommarkdownify.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from typing import Any
 3 | from urllib.parse import urlparse, urlunparse, quote, unquote
 4 | 
 5 | import markdownify
 6 | 
 7 | 
 8 | class _CustomMarkdownify(markdownify.MarkdownConverter):
 9 |     """
10 |     A custom version of markdownify's MarkdownConverter. Changes include:
11 | 
12 |     - Altering the default heading style to use '#', '##', etc.
13 |     - Removing javascript hyperlinks.
14 |     - Truncating images with large data:uri sources.
15 |     - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
16 |     """
17 | 
18 |     def __init__(self, **options: Any):
19 |         options["heading_style"] = options.get("heading_style", markdownify.ATX)
20 |         # Explicitly cast options to the expected type if necessary
21 |         super().__init__(**options)
22 | 
23 |     def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
24 |         """Same as usual, but be sure to start with a new line"""
25 |         if not convert_as_inline:
26 |             if not re.search(r"^\n", text):
27 |                 return "\n" + super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
28 | 
29 |         return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
30 | 
31 |     def convert_a(self, el: Any, text: str, convert_as_inline: bool):
32 |         """Same as usual converter, but removes Javascript links and escapes URIs."""
33 |         prefix, suffix, text = markdownify.chomp(text)  # type: ignore
34 |         if not text:
35 |             return ""
36 |         href = el.get("href")
37 |         title = el.get("title")
38 | 
39 |         # Escape URIs and skip non-http or file schemes
40 |         if href:
41 |             try:
42 |                 parsed_url = urlparse(href)  # type: ignore
43 |                 if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:  # type: ignore
44 |                     return "%s%s%s" % (prefix, text, suffix)
45 |                 href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path))))  # type: ignore
46 |             except ValueError:  # It's not clear if this ever gets thrown
47 |                 return "%s%s%s" % (prefix, text, suffix)
48 | 
49 |         # For the replacement see #29: text nodes underscores are escaped
50 |         if (
51 |             self.options["autolinks"]
52 |             and text.replace(r"\_", "_") == href
53 |             and not title
54 |             and not self.options["default_title"]
55 |         ):
56 |             # Shortcut syntax
57 |             return "<%s>" % href
58 |         if self.options["default_title"] and not title:
59 |             title = href
60 |         title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
61 |         return (
62 |             "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)
63 |             if href
64 |             else text
65 |         )
66 | 
67 |     def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
68 |         """Same as usual converter, but removes data URIs"""
69 | 
70 |         alt = el.attrs.get("alt", None) or ""
71 |         src = el.attrs.get("src", None) or ""
72 |         title = el.attrs.get("title", None) or ""
73 |         title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
74 |         if (
75 |             convert_as_inline
76 |             and el.parent.name not in self.options["keep_inline_images_in"]
77 |         ):
78 |             return alt
79 | 
80 |         # Remove dataURIs
81 |         if src.startswith("data:"):
82 |             src = src.split(",")[0] + "..."
83 | 
84 |         return "![%s](%s%s)" % (alt, src, title_part)
85 | 
86 |     def convert_soup(self, soup: Any) -> str:
87 |         return super().convert_soup(soup)  # type: ignore
88 | 


--------------------------------------------------------------------------------
/core/converters/docx.py:
--------------------------------------------------------------------------------
 1 | from typing import Union
 2 | 
 3 | import mammoth
 4 | 
 5 | from core.base import DocumentConverterResult
 6 | from core.converters.html import HtmlConverter
 7 | 
 8 | 
 9 | class DocxConverter(HtmlConverter):
10 |     """
11 |     Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
12 |     """
13 | 
14 |     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
15 |         # Bail if not a DOCX
16 |         extension = kwargs.get("file_extension", "")
17 |         if extension.lower() != ".docx":
18 |             return None
19 | 
20 |         result = None
21 |         with open(local_path, "rb") as docx_file:
22 |             style_map = kwargs.get("style_map", None)
23 | 
24 |             result = mammoth.convert_to_html(docx_file, style_map=style_map)
25 |             html_content = result.value
26 |             result = self._convert(html_content)
27 | 
28 |         return result
29 | 


--------------------------------------------------------------------------------
/core/converters/html.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Union
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from core.base import DocumentConverter, DocumentConverterResult
 6 | from core.converters.custommarkdownify import _CustomMarkdownify
 7 | 
 8 | 
 9 | class HtmlConverter(DocumentConverter):
10 |     """Anything with content type text/html"""
11 | 
12 |     def convert(
13 |         self, local_path: str, **kwargs: Any
14 |     ) -> Union[None, DocumentConverterResult]:
15 |         # Bail if not html
16 |         extension = kwargs.get("file_extension", "")
17 |         if extension.lower() not in [".html", ".htm"]:
18 |             return None
19 | 
20 |         result = None
21 |         with open(local_path, "rt", encoding="utf-8") as fh:
22 |             result = self._convert(fh.read())
23 | 
24 |         return result
25 | 
26 |     def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
27 |         """Helper function that converts and HTML string."""
28 | 
29 |         # Parse the string
30 |         soup = BeautifulSoup(html_content, "html.parser")
31 | 
32 |         # Remove javascript and style blocks
33 |         for script in soup(["script", "style"]):
34 |             script.extract()
35 | 
36 |         # Print only the main content
37 |         body_elm = soup.find("body")
38 |         webpage_text = ""
39 |         if body_elm:
40 |             webpage_text = _CustomMarkdownify().convert_soup(body_elm)
41 |         else:
42 |             webpage_text = _CustomMarkdownify().convert_soup(soup)
43 | 
44 |         assert isinstance(webpage_text, str)
45 | 
46 |         return DocumentConverterResult(
47 |             title=None if soup.title is None else soup.title.string,
48 |             text_content=webpage_text,
49 |         )
50 | 


--------------------------------------------------------------------------------
/core/converters/image.py:
--------------------------------------------------------------------------------
 1 | import base64
 2 | import mimetypes
 3 | from typing import Union
 4 | 
 5 | from core.base import DocumentConverterResult
 6 | from core.converters.media import MediaConverter
 7 | 
 8 | 
 9 | class ImageConverter(MediaConverter):
10 |     """
11 |     Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
12 |     """
13 | 
14 |     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
15 |         # Bail if not an image
16 |         extension = kwargs.get("file_extension", "")
17 |         if extension.lower() not in [".jpg", ".jpeg", ".png"]:
18 |             return None
19 | 
20 |         md_content = ""
21 | 
22 |         # Add metadata
23 |         metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
24 |         if metadata:
25 |             for f in [
26 |                 "ImageSize",
27 |                 "Title",
28 |                 "Caption",
29 |                 "Description",
30 |                 "Keywords",
31 |                 "Artist",
32 |                 "Author",
33 |                 "DateTimeOriginal",
34 |                 "CreateDate",
35 |                 "GPSPosition",
36 |             ]:
37 |                 if f in metadata:
38 |                     md_content += f"{f}: {metadata[f]}\n"
39 | 
40 |         # Try describing the image with GPTV
41 |         llm_client = kwargs.get("llm_client")
42 |         llm_model = kwargs.get("llm_model")
43 |         if llm_client is not None and llm_model is not None:
44 |             md_content += (
45 |                 "\n# Description:\n"
46 |                 + self._get_llm_description(
47 |                     local_path,
48 |                     extension,
49 |                     llm_client,
50 |                     llm_model,
51 |                     prompt=kwargs.get("llm_prompt"),
52 |                 ).strip()
53 |                 + "\n"
54 |             )
55 |         else:
56 |             md_content += """
57 |     Image description need set following env:
58 | 
59 |         - MARKIFY_LLM_API_BASE
60 |         - MARKIFY_LLM_API_KEY
61 |         - MARKIFY_LLM_MODE
62 | """
63 |         return DocumentConverterResult(
64 |             title=None,
65 |             text_content=md_content,
66 |         )
67 | 
68 |     def _get_llm_description(self, local_path, extension, client, model, prompt=None):
69 |         if prompt is None or prompt.strip() == "":
70 |             prompt = "Write a detailed caption for this image."
71 | 
72 |         data_uri = ""
73 |         with open(local_path, "rb") as image_file:
74 |             content_type, encoding = mimetypes.guess_type("_dummy" + extension)
75 |             if content_type is None:
76 |                 content_type = "image/jpeg"
77 |             image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
78 |             data_uri = f"data:{content_type};base64,{image_base64}"
79 | 
80 |         messages = [
81 |             {
82 |                 "role": "user",
83 |                 "content": [
84 |                     {"type": "text", "text": prompt},
85 |                     {
86 |                         "type": "image_url",
87 |                         "image_url": {
88 |                             "url": data_uri,
89 |                         },
90 |                     },
91 |                 ],
92 |             }
93 |         ]
94 | 
95 |         response = client.chat.completions.create(model=model, messages=messages)
96 |         return response.choices[0].message.content
97 | 


--------------------------------------------------------------------------------
/core/converters/ipynb.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from typing import Any, Union
 3 | 
 4 | from core.base import DocumentConverter, DocumentConverterResult, FileConversionException
 5 | 
 6 | 
 7 | class IpynbConverter(DocumentConverter):
 8 |     """Converts Jupyter Notebook (.ipynb) files to Markdown."""
 9 | 
10 |     def convert(
11 |         self, local_path: str, **kwargs: Any
12 |     ) -> Union[None, DocumentConverterResult]:
13 |         # Bail if not ipynb
14 |         extension = kwargs.get("file_extension", "")
15 |         if extension.lower() != ".ipynb":
16 |             return None
17 | 
18 |         # Parse and convert the notebook
19 |         result = None
20 |         with open(local_path, "rt", encoding="utf-8") as fh:
21 |             notebook_content = json.load(fh)
22 |             result = self._convert(notebook_content)
23 | 
24 |         return result
25 | 
26 |     def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
27 |         """Helper function that converts notebook JSON content to Markdown."""
28 |         try:
29 |             md_output = []
30 |             title = None
31 | 
32 |             for cell in notebook_content.get("cells", []):
33 |                 cell_type = cell.get("cell_type", "")
34 |                 source_lines = cell.get("source", [])
35 | 
36 |                 if cell_type == "markdown":
37 |                     md_output.append("".join(source_lines))
38 | 
39 |                     # Extract the first # heading as title if not already found
40 |                     if title is None:
41 |                         for line in source_lines:
42 |                             if line.startswith("# "):
43 |                                 title = line.lstrip("# ").strip()
44 |                                 break
45 | 
46 |                 elif cell_type == "code":
47 |                     # Code cells are wrapped in Markdown code blocks
48 |                     md_output.append(f"```python\n{''.join(source_lines)}\n```")
49 |                 elif cell_type == "raw":
50 |                     md_output.append(f"```\n{''.join(source_lines)}\n```")
51 | 
52 |             md_text = "\n\n".join(md_output)
53 | 
54 |             # Check for title in notebook metadata
55 |             title = notebook_content.get("metadata", {}).get("title", title)
56 | 
57 |             return DocumentConverterResult(
58 |                 title=title,
59 |                 text_content=md_text,
60 |             )
61 | 
62 |         except Exception as e:
63 |             raise FileConversionException(
64 |                 f"Error converting .ipynb file: {str(e)}"
65 |             ) from e
66 | 


--------------------------------------------------------------------------------
/core/converters/media.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import shutil
 3 | import subprocess
 4 | from _warnings import warn
 5 | 
 6 | from core.base import DocumentConverter
 7 | 
 8 | 
 9 | class MediaConverter(DocumentConverter):
10 |     """
11 |     Abstract class for multi-modal media (e.g., images and audio)
12 |     """
13 | 
14 |     def _get_metadata(self, local_path, exiftool_path=None):
15 |         if not exiftool_path:
16 |             which_exiftool = shutil.which("exiftool")
17 |             if which_exiftool:
18 |                 warn(
19 |                     f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g., 
20 | 
21 |     md = MarkItDown(exiftool_path="{which_exiftool}")
22 | 
23 | This warning will be removed in future releases.
24 | """,
25 |                     DeprecationWarning,
26 |                 )
27 | 
28 |             return None
29 |         else:
30 |             try:
31 |                 result = subprocess.run(
32 |                     [exiftool_path, "-json", local_path], capture_output=True, text=True
33 |                 ).stdout
34 |                 return json.loads(result)[0]
35 |             except Exception:
36 |                 return None
37 | 


--------------------------------------------------------------------------------
/core/converters/mineru/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinMountain/markify/622070781f1cdb74506b3ec1ec7d1749c1ef458c/core/converters/mineru/__init__.py


--------------------------------------------------------------------------------
/core/converters/mineru/pdf_processor.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import urllib.parse
  3 | from pathlib import Path
  4 | from typing import Dict
  5 | 
  6 | from magic_pdf.config.enums import SupportedPdfParseMethod
  7 | from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
  8 | from magic_pdf.data.dataset import PymuDocDataset
  9 | from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 10 | 
 11 | from core.converters.mineru.title_corrector import MarkdownTitleProcessor
 12 | 
 13 | 
 14 | class PDFProcessor:
 15 |     """PDF文档处理管道"""
 16 | 
 17 |     def __init__(self, output_dir: str = "output", base_url: str = "http://localhost:20926", **kwargs):
 18 |         self.output_dir = Path(output_dir)
 19 |         self.image_dir = self.output_dir / "images"
 20 |         self.base_url = base_url
 21 |         self._prepare_directories()
 22 | 
 23 |     def _prepare_directories(self):
 24 |         """创建输出目录结构"""
 25 |         self.image_dir.mkdir(parents=True, exist_ok=True)
 26 |         self.output_dir.mkdir(exist_ok=True)
 27 | 
 28 |     def process(self, pdf_path: str) -> Dict[str, str]:
 29 |         """处理PDF主流程"""
 30 |         pdf_path = Path(pdf_path)
 31 |         if not pdf_path.exists():
 32 |             raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")
 33 | 
 34 |         name_stem = pdf_path.stem
 35 |         writers = {
 36 |             'image': FileBasedDataWriter(str(self.image_dir)),
 37 |             'markdown': FileBasedDataWriter(str(self.output_dir))
 38 |         }
 39 | 
 40 |         # 读取并解析PDF
 41 |         pdf_content = FileBasedDataReader("").read(str(pdf_path))
 42 |         dataset = PymuDocDataset(pdf_content)
 43 | 
 44 |         # 执行解析流程
 45 |         if dataset.classify() == SupportedPdfParseMethod.OCR:
 46 |             result = dataset.apply(doc_analyze, ocr=True).pipe_ocr_mode(writers['image'])
 47 |         else:
 48 |             result = dataset.apply(doc_analyze, ocr=False).pipe_txt_mode(writers['image'])
 49 | 
 50 |         # 生成输出文件
 51 |         output_files = self._generate_outputs(result, writers, name_stem)
 52 | 
 53 |         # 自动修正标题层级
 54 |         self._adjust_title_levels(output_files['markdown'])
 55 | 
 56 |         self._replace_image_paths(output_files['markdown'], self.base_url)
 57 | 
 58 |         return output_files
 59 | 
 60 |     def _generate_outputs(self, result, writers, name_stem: str) -> Dict[str, str]:
 61 |         """生成所有输出文件"""
 62 |         # 生成原始Markdown
 63 |         md_file = f"{name_stem}.md"
 64 |         result.dump_md(writers['markdown'], md_file, self.image_dir.name)
 65 | 
 66 |         # 生成中间文件
 67 |         # result.dump_content_list(writers['markdown'], f"{name_stem}_content.json")
 68 |         # result.dump_middle_json(writers['markdown'], f"{name_stem}_middle.json")
 69 | 
 70 |         return {
 71 |             'markdown': str(self.output_dir / md_file),
 72 |             'images': str(self.image_dir),
 73 |             # 'middle_json': str(self.output_dir / f"{name_stem}_middle.json")
 74 |         }
 75 | 
 76 |     def _replace_image_paths(self, md_path: str, base_url: str):
 77 |         """替换Markdown文件中的本地图像路径为HTTP URL"""
 78 |         with open(md_path, 'r', encoding='utf-8') as f:
 79 |             content = f.read()
 80 | 
 81 |         # 匹配 Markdown 中的图像链接，假设格式为 ![alt](images/xxxxx)
 82 |         pattern = r'!\[.*?\]\((images/.*?)\)'
 83 |         replacement = lambda m: f'![{m.group(0).split("]")[0].split("[")[1]}]({urllib.parse.urljoin(base_url, "images/")}{m.group(1).split("/")[-1]})'
 84 |         new_content = re.sub(pattern, replacement, content)
 85 | 
 86 |         # 将修改后的内容写回文件
 87 |         with open(md_path, 'w', encoding='utf-8') as f:
 88 |             f.write(new_content)
 89 | 
 90 |     def _adjust_title_levels(self, md_path: str):
 91 |         """执行Markdown标题修正"""
 92 |         processor = MarkdownTitleProcessor()
 93 |         processor.process_file(md_path)
 94 | 
 95 | 
 96 | if __name__ == "__main__":
 97 |     # 示例用法
 98 |     processor = PDFProcessor()
 99 |     result = processor.process("/path/to/your.pdf")
100 |     print(f"处理完成，输出文件：{result}")
101 | 


--------------------------------------------------------------------------------
/core/converters/mineru/title_corrector.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from typing import List, Tuple, Optional
  3 | from pathlib import Path
  4 | 
  5 | 
  6 | class MarkdownTitleProcessor:
  7 |     """智能Markdown标题层级处理器"""
  8 | 
  9 |     def __init__(self, title_patterns: Optional[List[Tuple[str, int]]] = None):
 10 |         """
 11 |         初始化标题处理器
 12 | 
 13 |         Args:
 14 |             title_patterns: 自定义标题模式列表，格式为[(正则模式, 基准层级), ...]
 15 |         """
 16 |         # 默认支持中英文混合标题模式
 17 |         self.title_patterns = title_patterns or [
 18 |             # 中文章节模式
 19 |             (r'^(第[一二三四五六七八九十百]+章)\s*[:：]?\s*.+', 1),
 20 |             (r'^(第[一二三四五六七八九十百]+节)\s*[:：]?\s*.+', 2),
 21 |             (r'^【.+】\s*.+', 2),
 22 | 
 23 |             # 英文章节模式
 24 |             (r'^(Chapter|CHAPTER)\s+\d+\.?\s*[:-]?\s*.+', 1),
 25 |             (r'^(Section|SECTION)\s+\d+\.?\d*\s*[:-]?\s*.+', 2),
 26 | 
 27 |             # 数字层级模式
 28 |             (r'^\d+(?![.]\d)', 1),  # 单独数字开头：1
 29 |             (r'^\d+\.\d+(?![.]\d)', 2),  # 二级编号：1.1
 30 |             (r'^\d+\.\d+\.\d+', 3),  # 三级编号：1.1.1
 31 |             (r'^\d+\.\d+\.\d+\.\d+', 4),  # 四级编号：1.1.1.1
 32 | 
 33 |             # 特殊标识
 34 |             (r'^(※|◆|►)\s*.+', 3),  # 特殊符号标题
 35 |             (r'^(Note|Warning):\s*.+', 4)  # 提示类标题
 36 |         ]
 37 | 
 38 |         # 编译正则表达式
 39 |         self.compiled_patterns = [
 40 |             (re.compile(pattern, re.IGNORECASE), level)
 41 |             for pattern, level in self.title_patterns
 42 |         ]
 43 | 
 44 |         # 层级栈管理
 45 |         self.level_stack = [0]  # [当前层级，父层级，祖父层级...]
 46 | 
 47 |     def _clean_title(self, title: str) -> str:
 48 |         """清洗标题内容"""
 49 |         # 移除常见干扰符号
 50 |         title = re.sub(r'^[【《〈（(]', '', title)
 51 |         title = re.sub(r'[】》〉）):.]$', '', title)
 52 |         # 去除首尾特殊符号
 53 |         return title.strip('※★▪•·\t ')
 54 | 
 55 |     def determine_level(self, title: str) -> int:
 56 |         """智能判断标题层级"""
 57 |         clean_title = self._clean_title(title)
 58 | 
 59 |         # 优先匹配预定义模式
 60 |         for pattern, base_level in self.compiled_patterns:
 61 |             if pattern.match(clean_title):
 62 |                 return self._calculate_relative_level(base_level)
 63 | 
 64 |         # 无匹配时根据上下文推断
 65 |         return self._infer_level_from_context(clean_title)
 66 | 
 67 |     def _calculate_relative_level(self, base_level: int) -> int:
 68 |         """计算相对层级"""
 69 |         # 当前基准层级深度
 70 |         current_depth = len(self.level_stack)
 71 | 
 72 |         # 如果基准层级比当前深，则作为子级
 73 |         if base_level > current_depth:
 74 |             return current_depth + 1
 75 |         # 如果基准层级较浅，则重置层级栈
 76 |         elif base_level < current_depth:
 77 |             self.level_stack = self.level_stack[:base_level]
 78 |         return base_level
 79 | 
 80 |     def _infer_level_from_context(self, title: str) -> int:
 81 |         """根据上下文推断层级"""
 82 |         # 根据标题长度和内容特征推断
 83 |         if len(title) < 15 and not re.search(r'\s', title):
 84 |             return min(len(self.level_stack) + 1, 6)
 85 |         return max(len(self.level_stack), 1)
 86 | 
 87 |     def process_line(self, line: str) -> str:
 88 |         """处理单行Markdown文本"""
 89 |         # 匹配标题行
 90 |         match = re.match(r'^(#+)\s+(.+)$', line.strip())
 91 |         if not match:
 92 |             return line
 93 | 
 94 |         original_level = len(match.group(1))
 95 |         title_content = match.group(2)
 96 | 
 97 |         # 计算新层级
 98 |         new_level = self.determine_level(title_content)
 99 |         new_level = max(1, min(new_level, 6))  # 限制在1-6级
100 | 
101 |         # 更新层级栈
102 |         if new_level > len(self.level_stack):
103 |             self.level_stack.append(new_level)
104 |         else:
105 |             self.level_stack = self.level_stack[:new_level]
106 | 
107 |         return f"{'#' * new_level} {title_content}\n"
108 | 
109 |     def process_file(self, input_path: str, output_path: Optional[str] = None):
110 |         """处理整个Markdown文件"""
111 |         input_file = Path(input_path)
112 |         output_file = Path(output_path) if output_path else input_file
113 | 
114 |         with input_file.open('r', encoding='utf-8') as f:
115 |             lines = f.readlines()
116 | 
117 |         processed_lines = []
118 |         for line in lines:
119 |             processed_lines.append(self.process_line(line))
120 | 
121 |         with output_file.open('w', encoding='utf-8') as f:
122 |             f.writelines(processed_lines)
123 | 
124 | 
125 | if __name__ == '__main__':
126 |     main()


--------------------------------------------------------------------------------
/core/converters/mp3.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import tempfile
 3 | from typing import Union
 4 | from warnings import catch_warnings, resetwarnings
 5 | 
 6 | # Optional Transcription support
 7 | IS_AUDIO_TRANSCRIPTION_CAPABLE = False
 8 | try:
 9 |     # Using warnings' catch_warnings to catch
10 |     # pydub's warning of ffmpeg or avconv missing
11 |     with catch_warnings(record=True) as w:
12 |         import pydub
13 | 
14 |         if w:
15 |             raise ModuleNotFoundError
16 |     import speech_recognition as sr
17 | 
18 |     IS_AUDIO_TRANSCRIPTION_CAPABLE = True
19 | except ModuleNotFoundError:
20 |     pass
21 | finally:
22 |     resetwarnings()
23 | 
24 | from core.base import DocumentConverterResult
25 | from core.converters.wav import WavConverter
26 | 
27 | 
28 | class Mp3Converter(WavConverter):
29 |     """
30 |     Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
31 |     """
32 | 
33 |     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
34 |         # Bail if not a MP3
35 |         extension = kwargs.get("file_extension", "")
36 |         if extension.lower() != ".mp3":
37 |             return None
38 | 
39 |         md_content = ""
40 | 
41 |         # Add metadata
42 |         metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
43 |         if metadata:
44 |             for f in [
45 |                 "Title",
46 |                 "Artist",
47 |                 "Author",
48 |                 "Band",
49 |                 "Album",
50 |                 "Genre",
51 |                 "Track",
52 |                 "DateTimeOriginal",
53 |                 "CreateDate",
54 |                 "Duration",
55 |             ]:
56 |                 if f in metadata:
57 |                     md_content += f"{f}: {metadata[f]}\n"
58 | 
59 |         # Transcribe
60 |         if IS_AUDIO_TRANSCRIPTION_CAPABLE:
61 |             handle, temp_path = tempfile.mkstemp(suffix=".wav")
62 |             os.close(handle)
63 |             try:
64 |                 sound = pydub.AudioSegment.from_mp3(local_path)
65 |                 sound.export(temp_path, format="wav")
66 | 
67 |                 _args = dict()
68 |                 _args.update(kwargs)
69 |                 _args["file_extension"] = ".wav"
70 | 
71 |                 try:
72 |                     transcript = super()._transcribe_audio(temp_path).strip()
73 |                     md_content += "\n\n### Audio Transcript:\n" + (
74 |                         "[No speech detected]" if transcript == "" else transcript
75 |                     )
76 |                 except Exception:
77 |                     md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
78 | 
79 |             finally:
80 |                 os.unlink(temp_path)
81 | 
82 |         # Return the result
83 |         return DocumentConverterResult(
84 |             title=None,
85 |             text_content=md_content.strip(),
86 |         )
87 | 


--------------------------------------------------------------------------------
/core/converters/outlook.py:
--------------------------------------------------------------------------------
 1 | from typing import Union, Any
 2 | 
 3 | from olefile import olefile
 4 | 
 5 | from core.base import FileConversionException, DocumentConverterResult, DocumentConverter
 6 | 
 7 | 
 8 | class OutlookMsgConverter(DocumentConverter):
 9 |     """Converts Outlook .msg files to markdown by extracting email metadata and content.
10 | 
11 |     Uses the olefile package to parse the .msg file structure and extract:
12 |     - Email headers (From, To, Subject)
13 |     - Email body content
14 |     """
15 | 
16 |     def convert(
17 |         self, local_path: str, **kwargs: Any
18 |     ) -> Union[None, DocumentConverterResult]:
19 |         # Bail if not a MSG file
20 |         extension = kwargs.get("file_extension", "")
21 |         if extension.lower() != ".msg":
22 |             return None
23 | 
24 |         try:
25 |             msg = olefile.OleFileIO(local_path)
26 |             # Extract email metadata
27 |             md_content = "# Email Message\n\n"
28 | 
29 |             # Get headers
30 |             headers = {
31 |                 "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
32 |                 "To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
33 |                 "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
34 |             }
35 | 
36 |             # Add headers to markdown
37 |             for key, value in headers.items():
38 |                 if value:
39 |                     md_content += f"**{key}:** {value}\n"
40 | 
41 |             md_content += "\n## Content\n\n"
42 | 
43 |             # Get email body
44 |             body = self._get_stream_data(msg, "__substg1.0_1000001F")
45 |             if body:
46 |                 md_content += body
47 | 
48 |             msg.close()
49 | 
50 |             return DocumentConverterResult(
51 |                 title=headers.get("Subject"), text_content=md_content.strip()
52 |             )
53 | 
54 |         except Exception as e:
55 |             raise FileConversionException(
56 |                 f"Could not convert MSG file '{local_path}': {str(e)}"
57 |             )
58 | 
59 |     def _get_stream_data(
60 |         self, msg: olefile.OleFileIO, stream_path: str
61 |     ) -> Union[str, None]:
62 |         """Helper to safely extract and decode stream data from the MSG file."""
63 |         try:
64 |             if msg.exists(stream_path):
65 |                 data = msg.openstream(stream_path).read()
66 |                 # Try UTF-16 first (common for .msg files)
67 |                 try:
68 |                     return data.decode("utf-16-le").strip()
69 |                 except UnicodeDecodeError:
70 |                     # Fall back to UTF-8
71 |                     try:
72 |                         return data.decode("utf-8").strip()
73 |                     except UnicodeDecodeError:
74 |                         # Last resort - ignore errors
75 |                         return data.decode("utf-8", errors="ignore").strip()
76 |         except Exception:
77 |             pass
78 |         return None


--------------------------------------------------------------------------------
/core/converters/pdf.py:
--------------------------------------------------------------------------------
 1 | from typing import Union
 2 | from pathlib import Path
 3 | 
 4 | from core.base import DocumentConverter, DocumentConverterResult, FileConversionException
 5 | 
 6 | 
 7 | class PdfConverter(DocumentConverter):
 8 |     """默认PDF解析器（simple模式，基于pdfminer）"""
 9 | 
10 |     def convert(self, local_path: str, **kwargs) -> Union[None, DocumentConverterResult]:
11 |         # Bail if not a pdf
12 |         extension = kwargs.get("file_extension", "")
13 |         if extension.lower() != ".pdf":
14 |             return None
15 |         try:
16 |             import pdfminer.high_level
17 |             return DocumentConverterResult(
18 |                 title=None,
19 |                 text_content=pdfminer.high_level.extract_text(local_path)
20 |             )
21 |         except Exception as e:
22 |             raise FileConversionException(f"Simple PDF解析失败: {str(e)}")
23 | 
24 | 
25 | class AdvancedPdfConverter(DocumentConverter):
26 |     """使用mineru的增强PDF解析器（advanced模式）"""
27 | 
28 |     def convert(self, local_path: str, **kwargs) -> DocumentConverterResult:
29 |         # Bail if not a pdf
30 |         extension = kwargs.get("file_extension", "")
31 |         if extension.lower() != ".pdf":
32 |             return None
33 | 
34 |         try:
35 |             from core.converters.mineru.pdf_processor import PDFProcessor
36 |             processor = PDFProcessor(**kwargs)
37 |             result = processor.process(local_path)
38 |             
39 |             # 读取生成的markdown文件
40 |             with open(result["markdown"], "r", encoding="utf-8") as f:
41 |                 md_content = f.read()
42 |                 
43 |             return DocumentConverterResult(
44 |                 title=Path(local_path).stem,
45 |                 text_content=md_content
46 |             )
47 |         except ImportError:
48 |             raise RuntimeError("miner模块未找到，请安装mineru解析器")
49 |         except Exception as e:
50 |             raise FileConversionException(f"Advanced PDF解析失败: {str(e)}")
51 | 
52 | 
53 | class CloudPdfConverter(DocumentConverter):
54 |     """云端PDF解析器（预留cloud模式实现）"""
55 | 
56 |     def convert(self, local_path: str, **kwargs) -> DocumentConverterResult:
57 |         # Bail if not a pdf
58 |         extension = kwargs.get("file_extension", "")
59 |         if extension.lower() != ".pdf":
60 |             return None
61 |         raise NotImplementedError("Cloud模式尚未实现")
62 | 
63 | 


--------------------------------------------------------------------------------
/core/converters/plaintext.py:
--------------------------------------------------------------------------------
 1 | import mimetypes
 2 | from typing import Any, Union
 3 | 
 4 | from charset_normalizer import from_path
 5 | 
 6 | from core.base import DocumentConverter, DocumentConverterResult
 7 | 
 8 | 
 9 | class PlainTextConverter(DocumentConverter):
10 |     """Anything with content type text/plain"""
11 | 
12 |     def convert(
13 |         self, local_path: str, **kwargs: Any
14 |     ) -> Union[None, DocumentConverterResult]:
15 |         # Guess the content type from any file extension that might be around
16 |         content_type, _ = mimetypes.guess_type(
17 |             "__placeholder" + kwargs.get("file_extension", "")
18 |         )
19 | 
20 |         # Only accept text files
21 |         if content_type is None:
22 |             return None
23 |         elif all(
24 |             not content_type.lower().startswith(type_prefix)
25 |             for type_prefix in ["text/", "application/json"]
26 |         ):
27 |             return None
28 | 
29 |         text_content = str(from_path(local_path).best())
30 |         return DocumentConverterResult(
31 |             title=None,
32 |             text_content=text_content,
33 |         )
34 | 


--------------------------------------------------------------------------------
/core/converters/pptx.py:
--------------------------------------------------------------------------------
  1 | import html
  2 | import re
  3 | from typing import Union
  4 | 
  5 | import pptx
  6 | 
  7 | from core.base import DocumentConverterResult
  8 | from core.converters.html import HtmlConverter
  9 | 
 10 | 
 11 | class PptxConverter(HtmlConverter):
 12 |     """
 13 |     Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
 14 |     """
 15 | 
 16 |     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
 17 |         # Bail if not a PPTX
 18 |         extension = kwargs.get("file_extension", "")
 19 |         if extension.lower() != ".pptx":
 20 |             return None
 21 | 
 22 |         md_content = ""
 23 | 
 24 |         presentation = pptx.Presentation(local_path)
 25 |         slide_num = 0
 26 |         for slide in presentation.slides:
 27 |             slide_num += 1
 28 | 
 29 |             md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
 30 | 
 31 |             title = slide.shapes.title
 32 |             for shape in slide.shapes:
 33 |                 # Pictures
 34 |                 if self._is_picture(shape):
 35 |                     # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
 36 |                     alt_text = ""
 37 |                     try:
 38 |                         alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
 39 |                     except Exception:
 40 |                         pass
 41 | 
 42 |                     # A placeholder name
 43 |                     filename = re.sub(r"\W", "", shape.name) + ".jpg"
 44 |                     md_content += (
 45 |                         "\n!["
 46 |                         + (alt_text if alt_text else shape.name)
 47 |                         + "]("
 48 |                         + filename
 49 |                         + ")\n"
 50 |                     )
 51 | 
 52 |                 # Tables
 53 |                 if self._is_table(shape):
 54 |                     html_table = "<html><body><table>"
 55 |                     first_row = True
 56 |                     for row in shape.table.rows:
 57 |                         html_table += "<tr>"
 58 |                         for cell in row.cells:
 59 |                             if first_row:
 60 |                                 html_table += "<th>" + html.escape(cell.text) + "</th>"
 61 |                             else:
 62 |                                 html_table += "<td>" + html.escape(cell.text) + "</td>"
 63 |                         html_table += "</tr>"
 64 |                         first_row = False
 65 |                     html_table += "</table></body></html>"
 66 |                     md_content += (
 67 |                         "\n" + self._convert(html_table).text_content.strip() + "\n"
 68 |                     )
 69 | 
 70 |                 # Charts
 71 |                 if shape.has_chart:
 72 |                     md_content += self._convert_chart_to_markdown(shape.chart)
 73 | 
 74 |                 # Text areas
 75 |                 elif shape.has_text_frame:
 76 |                     if shape == title:
 77 |                         md_content += "# " + shape.text.lstrip() + "\n"
 78 |                     else:
 79 |                         md_content += shape.text + "\n"
 80 | 
 81 |             md_content = md_content.strip()
 82 | 
 83 |             if slide.has_notes_slide:
 84 |                 md_content += "\n\n### Notes:\n"
 85 |                 notes_frame = slide.notes_slide.notes_text_frame
 86 |                 if notes_frame is not None:
 87 |                     md_content += notes_frame.text
 88 |                 md_content = md_content.strip()
 89 | 
 90 |         return DocumentConverterResult(
 91 |             title=None,
 92 |             text_content=md_content.strip(),
 93 |         )
 94 | 
 95 |     def _is_picture(self, shape):
 96 |         if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
 97 |             return True
 98 |         if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
 99 |             if hasattr(shape, "image"):
100 |                 return True
101 |         return False
102 | 
103 |     def _is_table(self, shape):
104 |         if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
105 |             return True
106 |         return False
107 | 
108 |     def _convert_chart_to_markdown(self, chart):
109 |         md = "\n\n### Chart"
110 |         if chart.has_title:
111 |             md += f": {chart.chart_title.text_frame.text}"
112 |         md += "\n\n"
113 |         data = []
114 |         category_names = [c.label for c in chart.plots[0].categories]
115 |         series_names = [s.name for s in chart.series]
116 |         data.append(["Category"] + series_names)
117 | 
118 |         for idx, category in enumerate(category_names):
119 |             row = [category]
120 |             for series in chart.series:
121 |                 row.append(series.values[idx])
122 |             data.append(row)
123 | 
124 |         markdown_table = []
125 |         for row in data:
126 |             markdown_table.append("| " + " | ".join(map(str, row)) + " |")
127 |         header = markdown_table[0]
128 |         separator = "|" + "|".join(["---"] * len(data[0])) + "|"
129 |         return md + "\n".join([header, separator] + markdown_table[1:])
130 | 


--------------------------------------------------------------------------------
/core/converters/rss.py:
--------------------------------------------------------------------------------
  1 | import traceback
  2 | from typing import Union
  3 | from xml.dom import minidom
  4 | 
  5 | from bs4 import BeautifulSoup
  6 | 
  7 | from core.base import DocumentConverter, DocumentConverterResult
  8 | from core.converters.custommarkdownify import _CustomMarkdownify
  9 | 
 10 | 
 11 | class RSSConverter(DocumentConverter):
 12 |     """Convert RSS / Atom type to markdown"""
 13 | 
 14 |     def convert(
 15 |         self, local_path: str, **kwargs
 16 |     ) -> Union[None, DocumentConverterResult]:
 17 |         # Bail if not RSS type
 18 |         extension = kwargs.get("file_extension", "")
 19 |         if extension.lower() not in [".xml", ".rss", ".atom"]:
 20 |             return None
 21 |         try:
 22 |             doc = minidom.parse(local_path)
 23 |         except BaseException as _:
 24 |             return None
 25 |         result = None
 26 |         if doc.getElementsByTagName("rss"):
 27 |             # A RSS feed must have a root element of <rss>
 28 |             result = self._parse_rss_type(doc)
 29 |         elif doc.getElementsByTagName("feed"):
 30 |             root = doc.getElementsByTagName("feed")[0]
 31 |             if root.getElementsByTagName("entry"):
 32 |                 # An Atom feed must have a root element of <feed> and at least one <entry>
 33 |                 result = self._parse_atom_type(doc)
 34 |             else:
 35 |                 return None
 36 |         else:
 37 |             # not rss or atom
 38 |             return None
 39 | 
 40 |         return result
 41 | 
 42 |     def _parse_atom_type(
 43 |         self, doc: minidom.Document
 44 |     ) -> Union[None, DocumentConverterResult]:
 45 |         """Parse the type of an Atom feed.
 46 | 
 47 |         Returns None if the feed type is not recognized or something goes wrong.
 48 |         """
 49 |         try:
 50 |             root = doc.getElementsByTagName("feed")[0]
 51 |             title = self._get_data_by_tag_name(root, "title")
 52 |             subtitle = self._get_data_by_tag_name(root, "subtitle")
 53 |             entries = root.getElementsByTagName("entry")
 54 |             md_text = f"# {title}\n"
 55 |             if subtitle:
 56 |                 md_text += f"{subtitle}\n"
 57 |             for entry in entries:
 58 |                 entry_title = self._get_data_by_tag_name(entry, "title")
 59 |                 entry_summary = self._get_data_by_tag_name(entry, "summary")
 60 |                 entry_updated = self._get_data_by_tag_name(entry, "updated")
 61 |                 entry_content = self._get_data_by_tag_name(entry, "content")
 62 | 
 63 |                 if entry_title:
 64 |                     md_text += f"\n## {entry_title}\n"
 65 |                 if entry_updated:
 66 |                     md_text += f"Updated on: {entry_updated}\n"
 67 |                 if entry_summary:
 68 |                     md_text += self._parse_content(entry_summary)
 69 |                 if entry_content:
 70 |                     md_text += self._parse_content(entry_content)
 71 | 
 72 |             return DocumentConverterResult(
 73 |                 title=title,
 74 |                 text_content=md_text,
 75 |             )
 76 |         except BaseException as _:
 77 |             return None
 78 | 
 79 |     def _parse_rss_type(
 80 |         self, doc: minidom.Document
 81 |     ) -> Union[None, DocumentConverterResult]:
 82 |         """Parse the type of an RSS feed.
 83 | 
 84 |         Returns None if the feed type is not recognized or something goes wrong.
 85 |         """
 86 |         try:
 87 |             root = doc.getElementsByTagName("rss")[0]
 88 |             channel = root.getElementsByTagName("channel")
 89 |             if not channel:
 90 |                 return None
 91 |             channel = channel[0]
 92 |             channel_title = self._get_data_by_tag_name(channel, "title")
 93 |             channel_description = self._get_data_by_tag_name(channel, "description")
 94 |             items = channel.getElementsByTagName("item")
 95 |             if channel_title:
 96 |                 md_text = f"# {channel_title}\n"
 97 |             if channel_description:
 98 |                 md_text += f"{channel_description}\n"
 99 |             if not items:
100 |                 items = []
101 |             for item in items:
102 |                 title = self._get_data_by_tag_name(item, "title")
103 |                 description = self._get_data_by_tag_name(item, "description")
104 |                 pubDate = self._get_data_by_tag_name(item, "pubDate")
105 |                 content = self._get_data_by_tag_name(item, "content:encoded")
106 | 
107 |                 if title:
108 |                     md_text += f"\n## {title}\n"
109 |                 if pubDate:
110 |                     md_text += f"Published on: {pubDate}\n"
111 |                 if description:
112 |                     md_text += self._parse_content(description)
113 |                 if content:
114 |                     md_text += self._parse_content(content)
115 | 
116 |             return DocumentConverterResult(
117 |                 title=channel_title,
118 |                 text_content=md_text,
119 |             )
120 |         except BaseException as _:
121 |             print(traceback.format_exc())
122 |             return None
123 | 
124 |     def _parse_content(self, content: str) -> str:
125 |         """Parse the content of an RSS feed item"""
126 |         try:
127 |             # using bs4 because many RSS feeds have HTML-styled content
128 |             soup = BeautifulSoup(content, "html.parser")
129 |             return _CustomMarkdownify().convert_soup(soup)
130 |         except BaseException as _:
131 |             return content
132 | 
133 |     def _get_data_by_tag_name(
134 |         self, element: minidom.Element, tag_name: str
135 |     ) -> Union[str, None]:
136 |         """Get data from first child element with the given tag name.
137 |         Returns None when no such element is found.
138 |         """
139 |         nodes = element.getElementsByTagName(tag_name)
140 |         if not nodes:
141 |             return None
142 |         fc = nodes[0].firstChild
143 |         if fc:
144 |             return fc.data
145 |         return None
146 | 


--------------------------------------------------------------------------------
/core/converters/wav.py:
--------------------------------------------------------------------------------
 1 | from typing import Union
 2 | from warnings import catch_warnings, resetwarnings
 3 | 
 4 | # Optional Transcription support
 5 | IS_AUDIO_TRANSCRIPTION_CAPABLE = False
 6 | try:
 7 |     # Using warnings' catch_warnings to catch
 8 |     # pydub's warning of ffmpeg or avconv missing
 9 |     with catch_warnings(record=True) as w:
10 |         import pydub
11 | 
12 |         if w:
13 |             raise ModuleNotFoundError
14 |     import speech_recognition as sr
15 | 
16 |     IS_AUDIO_TRANSCRIPTION_CAPABLE = True
17 | except ModuleNotFoundError:
18 |     pass
19 | finally:
20 |     resetwarnings()
21 | 
22 | from core.base import DocumentConverterResult
23 | from core.converters.media import MediaConverter
24 | 
25 | 
26 | class WavConverter(MediaConverter):
27 |     """
28 |     Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
29 |     """
30 | 
31 |     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
32 |         # Bail if not a WAV
33 |         extension = kwargs.get("file_extension", "")
34 |         if extension.lower() != ".wav":
35 |             return None
36 | 
37 |         md_content = ""
38 | 
39 |         # Add metadata
40 |         metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
41 |         if metadata:
42 |             for f in [
43 |                 "Title",
44 |                 "Artist",
45 |                 "Author",
46 |                 "Band",
47 |                 "Album",
48 |                 "Genre",
49 |                 "Track",
50 |                 "DateTimeOriginal",
51 |                 "CreateDate",
52 |                 "Duration",
53 |             ]:
54 |                 if f in metadata:
55 |                     md_content += f"{f}: {metadata[f]}\n"
56 | 
57 |         # Transcribe
58 |         if IS_AUDIO_TRANSCRIPTION_CAPABLE:
59 |             try:
60 |                 transcript = self._transcribe_audio(local_path)
61 |                 md_content += "\n\n### Audio Transcript:\n" + (
62 |                     "[No speech detected]" if transcript == "" else transcript
63 |                 )
64 |             except Exception:
65 |                 md_content += (
66 |                     "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
67 |                 )
68 | 
69 |         return DocumentConverterResult(
70 |             title=None,
71 |             text_content=md_content.strip(),
72 |         )
73 | 
74 |     def _transcribe_audio(self, local_path) -> str:
75 |         recognizer = sr.Recognizer()
76 |         with sr.AudioFile(local_path) as source:
77 |             audio = recognizer.record(source)
78 |             return recognizer.recognize_google(audio).strip()
79 | 


--------------------------------------------------------------------------------
/core/converters/wikipedia.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from typing import Any, Union
 3 | 
 4 | from bs4 import BeautifulSoup
 5 | 
 6 | from core.base import DocumentConverter, DocumentConverterResult
 7 | from core.converters.custommarkdownify import _CustomMarkdownify
 8 | 
 9 | 
10 | class WikipediaConverter(DocumentConverter):
11 |     """Handle Wikipedia pages separately, focusing only on the main document content."""
12 | 
13 |     def convert(
14 |         self, local_path: str, **kwargs: Any
15 |     ) -> Union[None, DocumentConverterResult]:
16 |         # Bail if not Wikipedia
17 |         extension = kwargs.get("file_extension", "")
18 |         if extension.lower() not in [".html", ".htm"]:
19 |             return None
20 |         url = kwargs.get("url", "")
21 |         if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
22 |             return None
23 | 
24 |         # Parse the file
25 |         soup = None
26 |         with open(local_path, "rt", encoding="utf-8") as fh:
27 |             soup = BeautifulSoup(fh.read(), "html.parser")
28 | 
29 |         # Remove javascript and style blocks
30 |         for script in soup(["script", "style"]):
31 |             script.extract()
32 | 
33 |         # Print only the main content
34 |         body_elm = soup.find("div", {"id": "mw-content-text"})
35 |         title_elm = soup.find("span", {"class": "mw-page-title-main"})
36 | 
37 |         webpage_text = ""
38 |         main_title = None if soup.title is None else soup.title.string
39 | 
40 |         if body_elm:
41 |             # What's the title
42 |             if title_elm and len(title_elm) > 0:
43 |                 main_title = title_elm.string  # type: ignore
44 |                 assert isinstance(main_title, str)
45 | 
46 |             # Convert the page
47 |             webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(
48 |                 body_elm
49 |             )
50 |         else:
51 |             webpage_text = _CustomMarkdownify().convert_soup(soup)
52 | 
53 |         return DocumentConverterResult(
54 |             title=main_title,
55 |             text_content=webpage_text,
56 |         )
57 | 


--------------------------------------------------------------------------------
/core/converters/xls.py:
--------------------------------------------------------------------------------
 1 | from typing import Union
 2 | 
 3 | import pandas as pd
 4 | 
 5 | from core.base import DocumentConverterResult
 6 | from core.converters.html import HtmlConverter
 7 | 
 8 | 
 9 | class XlsConverter(HtmlConverter):
10 |     """
11 |     Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
12 |     """
13 | 
14 |     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
15 |         # Bail if not a XLS
16 |         extension = kwargs.get("file_extension", "")
17 |         if extension.lower() != ".xls":
18 |             return None
19 | 
20 |         sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
21 |         md_content = ""
22 |         for s in sheets:
23 |             md_content += f"## {s}\n"
24 |             html_content = sheets[s].to_html(index=False)
25 |             md_content += self._convert(html_content).text_content.strip() + "\n\n"
26 | 
27 |         return DocumentConverterResult(
28 |             title=None,
29 |             text_content=md_content.strip(),
30 |         )
31 | 


--------------------------------------------------------------------------------
/core/converters/xlsx.py:
--------------------------------------------------------------------------------
 1 | from typing import Union
 2 | 
 3 | import pandas as pd
 4 | 
 5 | from core.base import DocumentConverterResult
 6 | from core.converters.html import HtmlConverter
 7 | 
 8 | 
 9 | class XlsxConverter(HtmlConverter):
10 |     """
11 |     Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
12 |     """
13 | 
14 |     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
15 |         # Bail if not a XLSX
16 |         extension = kwargs.get("file_extension", "")
17 |         if extension.lower() != ".xlsx":
18 |             return None
19 | 
20 |         sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
21 |         md_content = ""
22 |         for s in sheets:
23 |             md_content += f"## {s}\n"
24 |             html_content = sheets[s].to_html(index=False)
25 |             md_content += self._convert(html_content).text_content.strip() + "\n\n"
26 | 
27 |         return DocumentConverterResult(
28 |             title=None,
29 |             text_content=md_content.strip(),
30 |         )
31 | 


--------------------------------------------------------------------------------
/core/converters/youtube.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import re
  3 | from typing import Any, Union, Dict, List
  4 | from urllib.parse import urlparse, parse_qs
  5 | 
  6 | from bs4 import BeautifulSoup
  7 | 
  8 | # Optional YouTube transcription support
  9 | try:
 10 |     from youtube_transcript_api import YouTubeTranscriptApi
 11 | 
 12 |     IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
 13 | except ModuleNotFoundError:
 14 |     pass
 15 | 
 16 | 
 17 | from core.base import DocumentConverter, DocumentConverterResult
 18 | 
 19 | 
 20 | class YouTubeConverter(DocumentConverter):
 21 |     """Handle YouTube specially, focusing on the video title, description, and transcript."""
 22 | 
 23 |     def convert(
 24 |         self, local_path: str, **kwargs: Any
 25 |     ) -> Union[None, DocumentConverterResult]:
 26 |         # Bail if not YouTube
 27 |         extension = kwargs.get("file_extension", "")
 28 |         if extension.lower() not in [".html", ".htm"]:
 29 |             return None
 30 |         url = kwargs.get("url", "")
 31 |         if not url.startswith("https://www.youtube.com/watch?"):
 32 |             return None
 33 | 
 34 |         # Parse the file
 35 |         soup = None
 36 |         with open(local_path, "rt", encoding="utf-8") as fh:
 37 |             soup = BeautifulSoup(fh.read(), "html.parser")
 38 | 
 39 |         # Read the meta tags
 40 |         assert soup.title is not None and soup.title.string is not None
 41 |         metadata: Dict[str, str] = {"title": soup.title.string}
 42 |         for meta in soup(["meta"]):
 43 |             for a in meta.attrs:
 44 |                 if a in ["itemprop", "property", "name"]:
 45 |                     metadata[meta[a]] = meta.get("content", "")
 46 |                     break
 47 | 
 48 |         # We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
 49 |         try:
 50 |             for script in soup(["script"]):
 51 |                 content = script.text
 52 |                 if "ytInitialData" in content:
 53 |                     lines = re.split(r"\r?\n", content)
 54 |                     obj_start = lines[0].find("{")
 55 |                     obj_end = lines[0].rfind("}")
 56 |                     if obj_start >= 0 and obj_end >= 0:
 57 |                         data = json.loads(lines[0][obj_start : obj_end + 1])
 58 |                         attrdesc = self._findKey(data, "attributedDescriptionBodyText")  # type: ignore
 59 |                         if attrdesc:
 60 |                             metadata["description"] = str(attrdesc["content"])
 61 |                     break
 62 |         except Exception:
 63 |             pass
 64 | 
 65 |         # Start preparing the page
 66 |         webpage_text = "# YouTube\n"
 67 | 
 68 |         title = self._get(metadata, ["title", "og:title", "name"])  # type: ignore
 69 |         assert isinstance(title, str)
 70 | 
 71 |         if title:
 72 |             webpage_text += f"\n## {title}\n"
 73 | 
 74 |         stats = ""
 75 |         views = self._get(metadata, ["interactionCount"])  # type: ignore
 76 |         if views:
 77 |             stats += f"- **Views:** {views}\n"
 78 | 
 79 |         keywords = self._get(metadata, ["keywords"])  # type: ignore
 80 |         if keywords:
 81 |             stats += f"- **Keywords:** {keywords}\n"
 82 | 
 83 |         runtime = self._get(metadata, ["duration"])  # type: ignore
 84 |         if runtime:
 85 |             stats += f"- **Runtime:** {runtime}\n"
 86 | 
 87 |         if len(stats) > 0:
 88 |             webpage_text += f"\n### Video Metadata\n{stats}\n"
 89 | 
 90 |         description = self._get(metadata, ["description", "og:description"])  # type: ignore
 91 |         if description:
 92 |             webpage_text += f"\n### Description\n{description}\n"
 93 | 
 94 |         if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
 95 |             transcript_text = ""
 96 |             parsed_url = urlparse(url)  # type: ignore
 97 |             params = parse_qs(parsed_url.query)  # type: ignore
 98 |             if "v" in params:
 99 |                 assert isinstance(params["v"][0], str)
100 |                 video_id = str(params["v"][0])
101 |                 try:
102 |                     youtube_transcript_languages = kwargs.get(
103 |                         "youtube_transcript_languages", ("en",)
104 |                     )
105 |                     # Must be a single transcript.
106 |                     transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages)  # type: ignore
107 |                     transcript_text = " ".join([part["text"] for part in transcript])  # type: ignore
108 |                     # Alternative formatting:
109 |                     # formatter = TextFormatter()
110 |                     # formatter.format_transcript(transcript)
111 |                 except Exception:
112 |                     pass
113 |             if transcript_text:
114 |                 webpage_text += f"\n### Transcript\n{transcript_text}\n"
115 | 
116 |         title = title if title else soup.title.string
117 |         assert isinstance(title, str)
118 | 
119 |         return DocumentConverterResult(
120 |             title=title,
121 |             text_content=webpage_text,
122 |         )
123 | 
124 |     def _get(
125 |         self,
126 |         metadata: Dict[str, str],
127 |         keys: List[str],
128 |         default: Union[str, None] = None,
129 |     ) -> Union[str, None]:
130 |         for k in keys:
131 |             if k in metadata:
132 |                 return metadata[k]
133 |         return default
134 | 
135 |     def _findKey(self, json: Any, key: str) -> Union[str, None]:  # TODO: Fix json type
136 |         if isinstance(json, list):
137 |             for elm in json:
138 |                 ret = self._findKey(elm, key)
139 |                 if ret is not None:
140 |                     return ret
141 |         elif isinstance(json, dict):
142 |             for k in json:
143 |                 if k == key:
144 |                     return json[k]
145 |                 else:
146 |                     ret = self._findKey(json[k], key)
147 |                     if ret is not None:
148 |                         return ret
149 |         return None
150 | 


--------------------------------------------------------------------------------
/core/converters/zip.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import shutil
  3 | import zipfile
  4 | from typing import Any, Union
  5 | 
  6 | from core.base import DocumentConverterResult, DocumentConverter
  7 | 
  8 | 
  9 | class ZipConverter(DocumentConverter):
 10 |     """Converts ZIP files to markdown by extracting and converting all contained files.
 11 | 
 12 |     The converter extracts the ZIP contents to a temporary directory, processes each file
 13 |     using appropriate converters based on file extensions, and then combines the results
 14 |     into a single markdown document. The temporary directory is cleaned up after processing.
 15 | 
 16 |     Example output format:
 17 |     ```markdown
 18 |     Content from the zip file `example.zip`:
 19 | 
 20 |     ## File: docs/readme.txt
 21 | 
 22 |     This is the content of readme.txt
 23 |     Multiple lines are preserved
 24 | 
 25 |     ## File: images/example.jpg
 26 | 
 27 |     ImageSize: 1920x1080
 28 |     DateTimeOriginal: 2024-02-15 14:30:00
 29 |     Description: A beautiful landscape photo
 30 | 
 31 |     ## File: data/report.xlsx
 32 | 
 33 |     ## Sheet1
 34 |     | Column1 | Column2 | Column3 |
 35 |     |---------|---------|---------|
 36 |     | data1   | data2   | data3   |
 37 |     | data4   | data5   | data6   |
 38 |     ```
 39 | 
 40 |     Key features:
 41 |     - Maintains original file structure in headings
 42 |     - Processes nested files recursively
 43 |     - Uses appropriate converters for each file type
 44 |     - Preserves formatting of converted content
 45 |     - Cleans up temporary files after processing
 46 |     """
 47 | 
 48 |     def convert(
 49 |         self, local_path: str, **kwargs: Any
 50 |     ) -> Union[None, DocumentConverterResult]:
 51 |         # Bail if not a ZIP
 52 |         extension = kwargs.get("file_extension", "")
 53 |         if extension.lower() != ".zip":
 54 |             return None
 55 | 
 56 |         # Get parent converters list if available
 57 |         parent_converters = kwargs.get("_parent_converters", [])
 58 |         if not parent_converters:
 59 |             return DocumentConverterResult(
 60 |                 title=None,
 61 |                 text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
 62 |             )
 63 | 
 64 |         extracted_zip_folder_name = (
 65 |             f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
 66 |         )
 67 |         extraction_dir = os.path.normpath(
 68 |             os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
 69 |         )
 70 |         md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"
 71 | 
 72 |         try:
 73 |             # Extract the zip file safely
 74 |             with zipfile.ZipFile(local_path, "r") as zipObj:
 75 |                 # Safeguard against path traversal
 76 |                 for member in zipObj.namelist():
 77 |                     member_path = os.path.normpath(os.path.join(extraction_dir, member))
 78 |                     if (
 79 |                         not os.path.commonprefix([extraction_dir, member_path])
 80 |                         == extraction_dir
 81 |                     ):
 82 |                         raise ValueError(
 83 |                             f"Path traversal detected in zip file: {member}"
 84 |                         )
 85 | 
 86 |                 # Extract all files safely
 87 |                 zipObj.extractall(path=extraction_dir)
 88 | 
 89 |             # Process each extracted file
 90 |             for root, dirs, files in os.walk(extraction_dir):
 91 |                 for name in files:
 92 |                     file_path = os.path.join(root, name)
 93 |                     relative_path = os.path.relpath(file_path, extraction_dir)
 94 | 
 95 |                     # Get file extension
 96 |                     _, file_extension = os.path.splitext(name)
 97 | 
 98 |                     # Update kwargs for the file
 99 |                     file_kwargs = kwargs.copy()
100 |                     file_kwargs["file_extension"] = file_extension
101 |                     file_kwargs["_parent_converters"] = parent_converters
102 | 
103 |                     # Try converting the file using available converters
104 |                     for converter in parent_converters:
105 |                         # Skip the zip converter to avoid infinite recursion
106 |                         if isinstance(converter, ZipConverter):
107 |                             continue
108 | 
109 |                         result = converter.convert(file_path, **file_kwargs)
110 |                         if result is not None:
111 |                             md_content += f"\n## File: {relative_path}\n\n"
112 |                             md_content += result.text_content + "\n\n"
113 |                             break
114 | 
115 |             # Clean up extracted files if specified
116 |             if kwargs.get("cleanup_extracted", True):
117 |                 shutil.rmtree(extraction_dir)
118 | 
119 |             return DocumentConverterResult(title=None, text_content=md_content.strip())
120 | 
121 |         except zipfile.BadZipFile:
122 |             return DocumentConverterResult(
123 |                 title=None,
124 |                 text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
125 |             )
126 |         except ValueError as ve:
127 |             return DocumentConverterResult(
128 |                 title=None,
129 |                 text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
130 |             )
131 |         except Exception as e:
132 |             return DocumentConverterResult(
133 |                 title=None,
134 |                 text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
135 |             )


--------------------------------------------------------------------------------
/core/markitdown.py:
--------------------------------------------------------------------------------
  1 | # type: ignore
  2 | import copy
  3 | import mimetypes
  4 | import os
  5 | import re
  6 | import tempfile
  7 | import traceback
  8 | from pathlib import Path
  9 | from typing import Any, List, Optional, Union
 10 | from urllib.parse import urlparse
 11 | 
 12 | # File-format detection
 13 | import puremagic
 14 | import requests
 15 | 
 16 | from core.base import DocumentConverterResult, DocumentConverter, FileConversionException, UnsupportedFormatException
 17 | from core.converters.bingsearch import BingSerpConverter
 18 | from core.converters.docx import DocxConverter
 19 | from core.converters.html import HtmlConverter
 20 | from core.converters.image import ImageConverter
 21 | from core.converters.ipynb import IpynbConverter
 22 | from core.converters.mp3 import Mp3Converter
 23 | from core.converters.outlook import OutlookMsgConverter
 24 | from core.converters.plaintext import PlainTextConverter
 25 | from core.converters.pptx import PptxConverter
 26 | from core.converters.rss import RSSConverter
 27 | from core.converters.wav import WavConverter
 28 | from core.converters.wikipedia import WikipediaConverter
 29 | from core.converters.xls import XlsConverter
 30 | from core.converters.xlsx import XlsxConverter
 31 | from core.converters.youtube import YouTubeConverter
 32 | from core.converters.zip import ZipConverter
 33 | 
 34 | 
 35 | class MarkItDown:
 36 |     """(In preview) An extremely simple text-based document reader, suitable for LLM use.
 37 |     This reader will convert common file-types or webpages to Markdown."""
 38 | 
 39 |     def __init__(
 40 |         self,
 41 |         requests_session: Optional[requests.Session] = None,
 42 |         llm_client: Optional[Any] = None,
 43 |         llm_model: Optional[str] = None,
 44 |         style_map: Optional[str] = None,
 45 |         exiftool_path: Optional[str] = None,
 46 |         mode: str = "simple",  # simple|advanced|cloud
 47 |     ):
 48 |         self.mode = mode
 49 |         if requests_session is None:
 50 |             self._requests_session = requests.Session()
 51 |         else:
 52 |             self._requests_session = requests_session
 53 | 
 54 |         if exiftool_path is None:
 55 |             exiftool_path = os.environ.get("EXIFTOOL_PATH")
 56 | 
 57 |         self._llm_client = llm_client
 58 |         self._llm_model = llm_model
 59 |         self._style_map = style_map
 60 |         self._exiftool_path = exiftool_path
 61 | 
 62 |         self._page_converters: List[DocumentConverter] = []
 63 | 
 64 |         # Register converters for successful browsing operations
 65 |         # Later registrations are tried first / take higher priority than earlier registrations
 66 |         # To this end, the most specific converters should appear below the most generic converters
 67 |         self.register_page_converter(PlainTextConverter())
 68 |         self.register_page_converter(HtmlConverter())
 69 |         self.register_page_converter(RSSConverter())
 70 |         self.register_page_converter(WikipediaConverter())
 71 |         self.register_page_converter(YouTubeConverter())
 72 |         self.register_page_converter(BingSerpConverter())
 73 |         self.register_page_converter(DocxConverter())
 74 |         self.register_page_converter(XlsxConverter())
 75 |         self.register_page_converter(XlsConverter())
 76 |         self.register_page_converter(PptxConverter())
 77 |         self.register_page_converter(WavConverter())
 78 |         self.register_page_converter(Mp3Converter())
 79 |         self.register_page_converter(ImageConverter())
 80 |         self.register_page_converter(IpynbConverter())
 81 | 
 82 |         # 动态注册PDF转换器
 83 |         # 确保PDF转换器只处理PDF文件
 84 |         if self.mode == 'advanced':
 85 |             from core.converters.pdf import AdvancedPdfConverter
 86 |             self.register_page_converter(AdvancedPdfConverter())
 87 |         elif self.mode == 'cloud':
 88 |             from core.converters.pdf import CloudPdfConverter
 89 |             self.register_page_converter(CloudPdfConverter())
 90 |         else:  # 默认simple模式
 91 |             from core.converters.pdf import PdfConverter
 92 |             self.register_page_converter(PdfConverter())
 93 | 
 94 |         self.register_page_converter(ZipConverter())
 95 |         self.register_page_converter(OutlookMsgConverter())
 96 | 
 97 |     def convert(
 98 |         self, source: Union[str, requests.Response, Path], **kwargs: Any
 99 |     ) -> DocumentConverterResult:  # TODO: deal with kwargs
100 |         """
101 |         Args:
102 |             - source: can be a string representing a path either as string pathlib path object or url, or a requests.response object
103 |             - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
104 |         """
105 | 
106 |         # Local path or url
107 |         if isinstance(source, str):
108 |             if (
109 |                 source.startswith("http://")
110 |                 or source.startswith("https://")
111 |                 or source.startswith("file://")
112 |             ):
113 |                 return self.convert_url(source, **kwargs)
114 |             else:
115 |                 return self.convert_local(source, **kwargs)
116 |         # Request response
117 |         elif isinstance(source, requests.Response):
118 |             return self.convert_response(source, **kwargs)
119 |         elif isinstance(source, Path):
120 |             return self.convert_local(source, **kwargs)
121 | 
122 |     def convert_local(
123 |         self, path: Union[str, Path], **kwargs: Any
124 |     ) -> DocumentConverterResult:  # TODO: deal with kwargs
125 |         if isinstance(path, Path):
126 |             path = str(path)
127 |         # Prepare a list of extensions to try (in order of priority)
128 |         ext = kwargs.get("file_extension")
129 |         extensions = [ext] if ext is not None else []
130 | 
131 |         # Get extension alternatives from the path and puremagic
132 |         base, ext = os.path.splitext(path)
133 |         self._append_ext(extensions, ext)
134 | 
135 |         for g in self._guess_ext_magic(path):
136 |             self._append_ext(extensions, g)
137 | 
138 |         # Convert
139 |         return self._convert(path, extensions, **kwargs)
140 | 
141 |     # TODO what should stream's type be?
142 |     def convert_stream(
143 |         self, stream: Any, **kwargs: Any
144 |     ) -> DocumentConverterResult:  # TODO: deal with kwargs
145 |         # Prepare a list of extensions to try (in order of priority)
146 |         ext = kwargs.get("file_extension")
147 |         extensions = [ext] if ext is not None else []
148 | 
149 |         # Save the file locally to a temporary file. It will be deleted before this method exits
150 |         handle, temp_path = tempfile.mkstemp()
151 |         fh = os.fdopen(handle, "wb")
152 |         result = None
153 |         try:
154 |             # Write to the temporary file
155 |             content = stream.read()
156 |             if isinstance(content, str):
157 |                 fh.write(content.encode("utf-8"))
158 |             else:
159 |                 fh.write(content)
160 |             fh.close()
161 | 
162 |             # Use puremagic to check for more extension options
163 |             for g in self._guess_ext_magic(temp_path):
164 |                 self._append_ext(extensions, g)
165 | 
166 |             # Convert
167 |             result = self._convert(temp_path, extensions, **kwargs)
168 |         # Clean up
169 |         finally:
170 |             try:
171 |                 fh.close()
172 |             except Exception:
173 |                 pass
174 |             os.unlink(temp_path)
175 | 
176 |         return result
177 | 
178 |     def convert_url(
179 |         self, url: str, **kwargs: Any
180 |     ) -> DocumentConverterResult:  # TODO: fix kwargs type
181 |         # Send a HTTP request to the URL
182 |         response = self._requests_session.get(url, stream=True)
183 |         response.raise_for_status()
184 |         return self.convert_response(response, **kwargs)
185 | 
186 |     def convert_response(
187 |         self, response: requests.Response, **kwargs: Any
188 |     ) -> DocumentConverterResult:  # TODO fix kwargs type
189 |         # Prepare a list of extensions to try (in order of priority)
190 |         ext = kwargs.get("file_extension")
191 |         extensions = [ext] if ext is not None else []
192 | 
193 |         # Guess from the mimetype
194 |         content_type = response.headers.get("content-type", "").split(";")[0]
195 |         self._append_ext(extensions, mimetypes.guess_extension(content_type))
196 | 
197 |         # Read the content disposition if there is one
198 |         content_disposition = response.headers.get("content-disposition", "")
199 |         m = re.search(r"filename=([^;]+)", content_disposition)
200 |         if m:
201 |             base, ext = os.path.splitext(m.group(1).strip("\"'"))
202 |             self._append_ext(extensions, ext)
203 | 
204 |         # Read from the extension from the path
205 |         base, ext = os.path.splitext(urlparse(response.url).path)
206 |         self._append_ext(extensions, ext)
207 | 
208 |         # Save the file locally to a temporary file. It will be deleted before this method exits
209 |         handle, temp_path = tempfile.mkstemp()
210 |         fh = os.fdopen(handle, "wb")
211 |         result = None
212 |         try:
213 |             # Download the file
214 |             for chunk in response.iter_content(chunk_size=512):
215 |                 fh.write(chunk)
216 |             fh.close()
217 | 
218 |             # Use puremagic to check for more extension options
219 |             for g in self._guess_ext_magic(temp_path):
220 |                 self._append_ext(extensions, g)
221 | 
222 |             # Convert
223 |             result = self._convert(temp_path, extensions, url=response.url, **kwargs)
224 |         # Clean up
225 |         finally:
226 |             try:
227 |                 fh.close()
228 |             except Exception:
229 |                 pass
230 |             os.unlink(temp_path)
231 | 
232 |         return result
233 | 
234 |     def _convert(
235 |         self, local_path: str, extensions: List[Union[str, None]], **kwargs
236 |     ) -> DocumentConverterResult:
237 |         error_trace = ""
238 |         for ext in extensions + [None]:  # Try last with no extension
239 |             for converter in self._page_converters:
240 |                 _kwargs = copy.deepcopy(kwargs)
241 | 
242 |                 # Overwrite file_extension appropriately
243 |                 if ext is None:
244 |                     if "file_extension" in _kwargs:
245 |                         del _kwargs["file_extension"]
246 |                 else:
247 |                     _kwargs.update({"file_extension": ext})
248 | 
249 |                 # Copy any additional global options
250 |                 if "llm_client" not in _kwargs and self._llm_client is not None:
251 |                     _kwargs["llm_client"] = self._llm_client
252 | 
253 |                 if "llm_model" not in _kwargs and self._llm_model is not None:
254 |                     _kwargs["llm_model"] = self._llm_model
255 | 
256 |                 if "style_map" not in _kwargs and self._style_map is not None:
257 |                     _kwargs["style_map"] = self._style_map
258 | 
259 |                 if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
260 |                     _kwargs["exiftool_path"] = self._exiftool_path
261 | 
262 |                 # Add the list of converters for nested processing
263 |                 _kwargs["_parent_converters"] = self._page_converters
264 | 
265 |                 # If we hit an error log it and keep trying
266 |                 try:
267 |                     res = converter.convert(local_path, **_kwargs)
268 |                 except Exception:
269 |                     error_trace = ("\n\n" + traceback.format_exc()).strip()
270 | 
271 |                 if res is not None:
272 |                     # Normalize the content
273 |                     res.text_content = "\n".join(
274 |                         [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
275 |                     )
276 |                     res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
277 | 
278 |                     # Todo
279 |                     return res
280 | 
281 |         # If we got this far without success, report any exceptions
282 |         if len(error_trace) > 0:
283 |             raise FileConversionException(
284 |                 f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
285 |             )
286 | 
287 |         # Nothing can handle it!
288 |         raise UnsupportedFormatException(
289 |             f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
290 |         )
291 | 
292 |     def _append_ext(self, extensions, ext):
293 |         """Append a unique non-None, non-empty extension to a list of extensions."""
294 |         if ext is None:
295 |             return
296 |         ext = ext.strip()
297 |         if ext == "":
298 |             return
299 |         # if ext not in extensions:
300 |         extensions.append(ext)
301 | 
302 |     def _guess_ext_magic(self, path):
303 |         """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
304 |         # Use puremagic to guess
305 |         try:
306 |             guesses = puremagic.magic_file(path)
307 | 
308 |             # Fix for: https://github.com/microsoft/markitdown/issues/222
309 |             # If there are no guesses, then try again after trimming leading ASCII whitespaces.
310 |             # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
311 |             # (space, tab, newline, carriage return, vertical tab, form feed).
312 |             if len(guesses) == 0:
313 |                 with open(path, "rb") as file:
314 |                     while True:
315 |                         char = file.read(1)
316 |                         if not char:  # End of file
317 |                             break
318 |                         if not char.isspace():
319 |                             file.seek(file.tell() - 1)
320 |                             break
321 |                     try:
322 |                         guesses = puremagic.magic_stream(file)
323 |                     except puremagic.main.PureError:
324 |                         pass
325 | 
326 |             extensions = list()
327 |             for g in guesses:
328 |                 ext = g.extension.strip()
329 |                 if len(ext) > 0:
330 |                     if not ext.startswith("."):
331 |                         ext = "." + ext
332 |                     if ext not in extensions:
333 |                         extensions.append(ext)
334 |             return extensions
335 |         except FileNotFoundError:
336 |             pass
337 |         except IsADirectoryError:
338 |             pass
339 |         except PermissionError:
340 |             pass
341 |         return []
342 | 
343 |     def register_page_converter(self, converter: DocumentConverter) -> None:
344 |         """Register a page text converter."""
345 |         self._page_converters.insert(0, converter)
346 | 


--------------------------------------------------------------------------------
/core/model_manager.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | from pathlib import Path
  4 | 
  5 | from huggingface_hub import snapshot_download as hf_download
  6 | from modelscope.hub.snapshot_download import snapshot_download as ms_download
  7 | 
  8 | DEFAULT_CONFIG_NAME = "magic-pdf.json"
  9 | GITHUB_TEMPLATE_URL = "https://raw.githubusercontent.com/opendatalab/MinerU/master/magic-pdf.template.json"
 10 | MODEL_REPOS = {
 11 |     'main': 'opendatalab/PDF-Extract-Kit-1.0',
 12 |     'layout': 'hantian/layoutreader'
 13 | }
 14 | 
 15 | 
 16 | class ModelConfigurator:
 17 |     """模型配置管理器"""
 18 | 
 19 |     def __init__(self, device='cpu', models_dir=None, use_modelscope=True):
 20 |         self.device = device
 21 |         self.use_modelscope = use_modelscope
 22 |         self.models_dir = models_dir
 23 |         self.config_path = self._get_config_path()
 24 |         self.mineru_patterns = [
 25 |             "models/Layout/LayoutLMv3/*",
 26 |             "models/Layout/YOLO/*",
 27 |             "models/MFD/YOLO/*",
 28 |             "models/MFR/unimernet_small_2501/*",
 29 |             "models/TabRec/TableMaster/*",
 30 |             "models/TabRec/StructEqTable/*",
 31 |         ]
 32 |         if self.use_modelscope:
 33 |             MODEL_REPOS['layout'] = 'ppaanngggg/layoutreader'
 34 | 
 35 |     def _get_cache_dir(self, model_type):
 36 |         """获取符合各库规范的缓存目录"""
 37 |         if self.models_dir:
 38 |             custom_dir = Path(self.models_dir).expanduser().resolve()
 39 |             return custom_dir / model_type
 40 | 
 41 |         # 自动识别默认缓存路径
 42 |         if self.use_modelscope:
 43 |             return Path.home() / ".cache/modelscope/hub" / MODEL_REPOS[model_type]
 44 |         else:
 45 |             return Path.home() / ".cache/huggingface/hub" / MODEL_REPOS[model_type]
 46 | 
 47 |     def _get_config_path(self):
 48 |         """获取配置文件路径"""
 49 |         env_path = os.getenv('MINERU_TOOLS_CONFIG_JSON')
 50 |         return Path(env_path) if env_path else Path.home() / DEFAULT_CONFIG_NAME
 51 | 
 52 |     def setup_environment(self):
 53 |         """配置环境并下载模型"""
 54 |         self._download_models()
 55 |         self._generate_config()
 56 |         os.environ['MINERU_TOOLS_CONFIG_JSON'] = str(self.config_path)
 57 | 
 58 |     def _download_models(self):
 59 |         """改进后的下载方法"""
 60 |         downloader = ms_download if self.use_modelscope else hf_download
 61 | 
 62 |         model_paths = {}
 63 |         for model_type in ['main', 'layout']:
 64 |             cache_dir = self._get_cache_dir(model_type)
 65 | 
 66 |             print(f"下载 {model_type} 模型到: {cache_dir}")
 67 | 
 68 |             # 保留库的默认缓存行为，仅在指定--models-dir时覆盖
 69 |             download_args = {
 70 |                 'repo_id': MODEL_REPOS[model_type],
 71 |                 'local_dir': str(cache_dir),  # 新增参数确保文件存储在指定位置
 72 |                 'allow_patterns': self.mineru_patterns if model_type == 'main' else None  # 添加过滤规则
 73 |             }
 74 | 
 75 |             # 仅在自定义路径时覆盖缓存目录
 76 |             if self.models_dir:
 77 |                 download_args['cache_dir'] = str(cache_dir.parent)
 78 | 
 79 |             snapshot_path = downloader(**download_args)
 80 | 
 81 |             # 处理特殊目录结构
 82 |             if model_type == 'main':
 83 |                 self.main_model_path = Path(snapshot_path) / 'models'
 84 |             else:
 85 |                 self.layout_model_path = Path(snapshot_path)
 86 | 
 87 |         return model_paths
 88 | 
 89 |     def _generate_config(self):
 90 |         """生成配置文件"""
 91 |         template_path = "assets/magic-pdf-template.json"
 92 |         try:
 93 |             with open(template_path, "r") as f:
 94 |                 template_config = json.load(f)
 95 |             print(f"成功加载模板配置: {template_path}")
 96 |         except Exception as e:
 97 |             print(f"加载模板配置失败，使用默认值: {e}")
 98 |             template_config = {}
 99 | 
100 |         custom_config = {
101 |             "device-mode": self.device,
102 |             "models-dir": str(self.main_model_path),
103 |             "layoutreader-model-dir": str(self.layout_model_path),
104 |         }
105 |         template_config.update(custom_config)
106 |         config = template_config
107 | 
108 |         if self.config_path.exists():
109 |             with open(self.config_path, 'r') as f:
110 |                 existing_config = json.load(f)
111 |             existing_config.update(custom_config)
112 |             config = existing_config
113 | 
114 |         with open(self.config_path, 'w') as f:
115 |             json.dump(config, f, indent=2)
116 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | import io
  2 | import os
  3 | import uuid
  4 | from contextlib import asynccontextmanager
  5 | from pathlib import Path
  6 | from typing import Optional, List
  7 | 
  8 | import openai
  9 | from fastapi import FastAPI, UploadFile, File, HTTPException, status, Depends, BackgroundTasks, Form, Query
 10 | from fastapi.responses import FileResponse
 11 | from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 12 | from pydantic import BaseModel
 13 | from sqlalchemy.orm import Session
 14 | from fastapi.staticfiles import StaticFiles
 15 | 
 16 | from core.markitdown import MarkItDown
 17 | from core.base import DocumentConverterResult
 18 | from core.model_manager import ModelConfigurator
 19 | from repository.db import get_db, Job
 20 | 
 21 | # 安全验证
 22 | security = HTTPBearer()
 23 | 
 24 | # 从环境变量获取API密钥
 25 | API_KEY = os.getenv("MARKIT_API_KEY", "secret-key")
 26 | OUTPUT_DIR = Path("output")
 27 | OUTPUT_DIR.mkdir(exist_ok=True)
 28 | MINER_RUNNING_DEVICE = os.getenv("MINER_RUNNING_DEVICE", "cpu")
 29 | port = int(os.getenv("PORT", 20926))
 30 | 
 31 | 
 32 | # 依赖项：API Key 验证
 33 | async def verify_api_key(
 34 |         credentials: HTTPAuthorizationCredentials = Depends(security)
 35 | ):
 36 |     if credentials.scheme != "Bearer" or credentials.credentials != API_KEY:
 37 |         raise HTTPException(
 38 |             status_code=status.HTTP_401_UNAUTHORIZED,
 39 |             detail="Invalid API Key",
 40 |         )
 41 |     return credentials
 42 | 
 43 | 
 44 | @asynccontextmanager
 45 | async def lifespan(app: FastAPI):
 46 |     """服务启动和关闭时的生命周期管理"""
 47 |     try:
 48 |         # 初始化模型
 49 |         configurator = ModelConfigurator(
 50 |             device=os.getenv("MINERU_DEVICE", MINER_RUNNING_DEVICE),
 51 |             use_modelscope=os.getenv("MINERU_USE_MODELSCOPE", "true").lower() in ("true", "1")
 52 |         )
 53 |         configurator.setup_environment()
 54 |         print("模型初始化完成")
 55 |     except Exception as e:
 56 |         print(f"模型初始化失败: {str(e)}")
 57 |         raise
 58 | 
 59 |     yield  # 应用运行期间
 60 | 
 61 |     # 清理逻辑（可选）
 62 |     print("服务关闭，清理资源...")
 63 | 
 64 | 
 65 | # FastAPI 应用
 66 | app = FastAPI(lifespan=lifespan)
 67 | if not os.path.exists("output/images"):
 68 |     os.mkdir("output/images")
 69 | app.mount("/images", StaticFiles(directory="output/images"), name="images")
 70 | 
 71 | 
 72 | # from slowapi import Limiter, _rate_limit_exceeded_handler
 73 | # from slowapi.errors import RateLimitExceeded
 74 | # from slowapi.util import get_remote_address
 75 | # limiter = Limiter(key_func=get_remote_address)
 76 | # app.state.limiter = limiter
 77 | # app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
 78 | # @limiter.limit("100/minute")
 79 | 
 80 | 
 81 | # 数据模型
 82 | class JobStatusResponse(BaseModel):
 83 |     job_id: str
 84 |     status: str
 85 |     filename: str
 86 |     params: dict
 87 |     error: Optional[str]
 88 | 
 89 | 
 90 | class JobResultResponse(BaseModel):
 91 |     job_id: str
 92 |     download_url: str
 93 |     format: str
 94 | 
 95 | 
 96 | oai_client = None
 97 | if os.getenv("MARKIFY_LLM_API_KEY", None) and os.getenv("MARKIFY_LLM_API_BASE", None):
 98 |     oai_client = openai.OpenAI(
 99 |         api_key=os.getenv("MARKIFY_LLM_API_KEY", None),
100 |         base_url=os.getenv("MARKIFY_LLM_API_BASE", None)
101 |     )
102 | 
103 | 
104 | def process_file(db: Session, job_id: str, file_content: bytes, filename: str, mode: str = "simple"):
105 |     """处理各种文件的后台任务"""
106 |     try:
107 |         # 更新任务状态为 processing
108 |         job = db.query(Job).filter(Job.id == job_id).first()
109 |         if not job:
110 |             raise ValueError(f"Job {job_id} not found")
111 | 
112 |         job.status = "processing"
113 |         db.commit()
114 | 
115 |         # 创建处理器
116 |         markitdown = MarkItDown(mode=mode,
117 |                                 llm_client=oai_client,
118 |                                 llm_model=os.getenv("MARKIFY_LLM_MODEL", None)
119 |                                 )
120 | 
121 |         # 根据输入类型处理
122 |         if filename.endswith('.md'):
123 |             result = DocumentConverterResult(text_content=file_content.decode('utf-8'))
124 |         else:
125 |             # 将字节内容转为文件流
126 |             file_stream = io.BytesIO(file_content)
127 |             result = markitdown.convert_stream(file_stream, base_url="http://localhost:20926")
128 | 
129 |         # 保存结果到文件
130 |         output_file = OUTPUT_DIR / f"{job_id}.md"
131 |         with open(output_file, "w", encoding="utf-8") as f:
132 |             f.write(result.text_content)
133 | 
134 |         # 更新任务状态为 completed
135 |         job.status = "completed"
136 |         job.result_file = str(output_file)
137 |         db.commit()
138 | 
139 |     except Exception as e:
140 |         # 更新任务状态为 failed
141 |         job.status = "failed"
142 |         job.error = f"{type(e).__name__}: {str(e)}"
143 |         db.commit()
144 | 
145 | 
146 | @app.post("/api/jobs", status_code=status.HTTP_202_ACCEPTED)
147 | async def upload_file(
148 |         background_tasks: BackgroundTasks,
149 |         file: UploadFile = File(...),
150 |         mode: str = Form("simple"),
151 |         db: Session = Depends(get_db)
152 | ):
153 |     """上传文件并启动转换任务"""
154 |     # 生成任务ID
155 |     job_id = str(uuid.uuid4())
156 | 
157 |     try:
158 |         # 读取文件内容
159 |         content = await file.read()
160 | 
161 |         # 创建任务记录
162 |         job = Job(
163 |             id=job_id,
164 |             filename=file.filename,
165 |             params={"mode": mode},
166 |             status="pending"
167 |         )
168 |         db.add(job)
169 |         db.commit()
170 | 
171 |         # 启动后台任务
172 |         background_tasks.add_task(
173 |             process_file,
174 |             db=db,
175 |             job_id=job_id,
176 |             file_content=content,
177 |             filename=file.filename,
178 |             mode=mode
179 |         )
180 | 
181 |         return {"job_id": job_id}
182 | 
183 |     except Exception as e:
184 |         raise HTTPException(
185 |             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
186 |             detail=f"File upload failed: {str(e)}"
187 |         )
188 | 
189 | 
190 | @app.get("/api/jobs", response_model=List[JobStatusResponse])
191 | async def list_jobs(
192 |         db: Session = Depends(get_db),
193 |         page: int = Query(0, ge=0, description=""),
194 |         limit: int = Query(10, gt=0, le=100, description="default 10，max 100")):
195 |     """查询任务状态"""
196 |     jobs = db.query(Job).order_by(Job.created_at.desc()).limit(limit).offset(page * limit).all()
197 |     if not jobs:
198 |         raise HTTPException(
199 |             status_code=status.HTTP_404_NOT_FOUND,
200 |             detail="Job not found"
201 |         )
202 | 
203 |     response_list = []
204 |     for job in jobs:
205 |         response_list.append(JobStatusResponse(
206 |             job_id=job.id,
207 |             status=job.status,
208 |             filename=job.filename,
209 |             params=job.params,
210 |             error=job.error
211 |         ))
212 |     return response_list
213 | 
214 | 
215 | @app.get("/api/jobs/{job_id}", response_model=JobStatusResponse)
216 | async def get_job_status(
217 |         job_id: str,
218 |         db: Session = Depends(get_db)
219 | ):
220 |     """查询任务状态"""
221 |     job = db.query(Job).filter(Job.id == job_id).first()
222 |     if not job:
223 |         raise HTTPException(
224 |             status_code=status.HTTP_404_NOT_FOUND,
225 |             detail="Job not found"
226 |         )
227 | 
228 |     return JobStatusResponse(
229 |         job_id=job.id,
230 |         status=job.status,
231 |         filename=job.filename,
232 |         params=job.params,
233 |         error=job.error
234 |     )
235 | 
236 | 
237 | @app.get("/api/jobs/{job_id}/result")
238 | async def download_result(
239 |         job_id: str,
240 |         db: Session = Depends(get_db)
241 | ):
242 |     """下载任务结果文件"""
243 |     job = db.query(Job).filter(Job.id == job_id).first()
244 |     if not job:
245 |         raise HTTPException(
246 |             status_code=status.HTTP_404_NOT_FOUND,
247 |             detail="Job not found"
248 |         )
249 | 
250 |     if job.status != "completed":
251 |         raise HTTPException(
252 |             status_code=status.HTTP_425_TOO_EARLY,
253 |             detail="Job not completed"
254 |         )
255 | 
256 |     result_file = job.result_file
257 |     if not result_file or not os.path.exists(result_file):
258 |         raise HTTPException(
259 |             status_code=status.HTTP_404_NOT_FOUND,
260 |             detail="Result file not found"
261 |         )
262 | 
263 |     # 返回文件内容
264 |     return FileResponse(
265 |         result_file,
266 |         filename=f"{job.filename}.md",
267 |         media_type="text/markdown"
268 |     )
269 | 
270 | 
271 | if __name__ == "__main__":
272 |     import uvicorn
273 | 
274 |     uvicorn.run(app, host="0.0.0.0", port=port)
275 | 


--------------------------------------------------------------------------------
/repository/db.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import create_engine, Column, String, Integer, JSON, DateTime
 2 | from sqlalchemy.orm import declarative_base
 3 | from sqlalchemy.orm import sessionmaker
 4 | from datetime import datetime
 5 | 
 6 | # SQLite 数据库路径
 7 | DATABASE_URL = "sqlite:///./jobs.db"
 8 | 
 9 | # 创建数据库引擎
10 | engine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False})
11 | 
12 | # 创建 SessionLocal
13 | SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
14 | 
15 | # 声明基类
16 | Base = declarative_base()
17 | 
18 | 
19 | # 任务模型
20 | class Job(Base):
21 |     __tablename__ = "jobs"
22 | 
23 |     id = Column(String, primary_key=True, index=True)
24 |     status = Column(String, default="pending")
25 |     filename = Column(String)
26 |     params = Column(JSON)
27 |     result_file = Column(String)
28 |     error = Column(String)
29 |     created_at = Column(DateTime, default=datetime.utcnow)
30 |     updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
31 | 
32 | 
33 | # 创建数据库表
34 | Base.metadata.create_all(bind=engine)
35 | 
36 | 
37 | # 获取数据库会话
38 | def get_db():
39 |     db = SessionLocal()
40 |     try:
41 |         yield db
42 |     finally:
43 |         db.close()
44 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | # Core dependencies
 2 | beautifulsoup4~=4.12.3
 3 | requests~=2.32.3
 4 | mammoth~=1.9.0
 5 | markdownify~=0.14.1
 6 | numpy
 7 | python-pptx==1.0.2
 8 | pandas~=2.2.3
 9 | openpyxl==3.1.5
10 | xlrd==2.0.1
11 | puremagic~=1.28
12 | pydub~=0.25.1
13 | olefile~=0.47
14 | youtube-transcript-api==0.6.3
15 | SpeechRecognition==3.14.0
16 | pathvalidate==3.2.3
17 | charset-normalizer==3.4.1
18 | openai~=1.59.7
19 | magic-pdf[full] --extra-index-url https://wheels.myhloli.com
20 | modelscope~=1.22.2
21 | huggingface_hub~=0.27.1
22 | slowapi~=0.1.9
23 | limits~=4.0.1
24 | python-multipart~=0.0.20
25 | uvicorn>=0.34.0
26 | sqlalchemy>=2.0.37
27 | # Development dependencies (optional, install with `-r dev-requirements.txt`)
28 | mypy>=1.0.0
29 | 
30 | fastapi~=0.115.7
31 | pydantic~=2.10.5
32 | setuptools~=75.1.0
33 | 
34 | streamlit~=1.29.0


--------------------------------------------------------------------------------