├── .github
│   └── workflows
│       └── docker-build.yml
├── .gitignore
├── Dockerfile
├── LICENSE
├── NOTICE
├── README.md
├── assets
│   ├── magic-pdf-template.json
│   ├── sponsor.JPG
│   ├── streamlint_ui.png
│   └── zsxq.JPG
├── client
│   └── streamlit_client.py
├── core
│   ├── __init__.py
│   ├── base.py
│   ├── converters
│   │   ├── __init__.py
│   │   ├── bingsearch.py
│   │   ├── custommarkdownify.py
│   │   ├── docx.py
│   │   ├── html.py
│   │   ├── image.py
│   │   ├── ipynb.py
│   │   ├── media.py
│   │   ├── mineru
│   │   │   ├── __init__.py
│   │   │   ├── pdf_processor.py
│   │   │   └── title_corrector.py
│   │   ├── mp3.py
│   │   ├── outlook.py
│   │   ├── pdf.py
│   │   ├── plaintext.py
│   │   ├── pptx.py
│   │   ├── rss.py
│   │   ├── wav.py
│   │   ├── wikipedia.py
│   │   ├── xls.py
│   │   ├── xlsx.py
│   │   ├── youtube.py
│   │   └── zip.py
│   ├── markitdown.py
│   └── model_manager.py
├── main.py
├── repository
│   └── db.py
└── requirements.txt

/.github/workflows/docker-build.yml:
--------------------------------------------------------------------------------
1 | name: Docker Image CI
2 | 
3 | on:
4 |   push:
5 |     tags:
6 |       - 'v[0-9]+.[0-9]+.[0-9]+'
7 | 
8 | jobs:
9 |   build:
10 | 
11 |     runs-on: ubuntu-latest
12 | 
13 |     env:
14 |       IMAGE_NAME: ${{ github.event.repository.name }}
15 | 
16 |     steps:
17 |     - uses: actions/checkout@v4
18 | 
19 |     - name: Remove 'v' prefix from tag
20 |       id: tag_name
21 |       run: |
22 |         TAG_NAME=${GITHUB_REF#refs/tags/}
23 |         TAG_NAME=${TAG_NAME#v}
24 |         echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV
25 | 
26 |     - name: Log in to DockerHub
27 |       run: echo "${{ secrets.DOCKERHUB_TOKEN }}" | docker login -u "${{ secrets.DOCKERHUB_USERNAME }}" --password-stdin
28 | 
29 |     - name: Build the Docker image
30 |       run: docker build . --file Dockerfile --tag ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ env.TAG_NAME }}
31 | 
32 |     - name: Push the Docker image
33 |       run: docker push ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ env.TAG_NAME }}
34 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 | 
29 | # PyInstaller
30 | #  Usually these files are written by a python script from a template
31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear
167 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
168 | #.idea/
169 | 
170 | # Ruff stuff:
171 | .ruff_cache/
172 | 
173 | # PyPI configuration file
174 | .pypirc
175 | 
176 | # custom
177 | *.db
178 | output_files
179 | .DS_Store
180 | .idea
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.10-slim
2 | 
3 | ENV PYTHONDONTWRITEBYTECODE=1
4 | 
5 | ENV PYTHONUNBUFFERED=1
6 | 
7 | ENV HF_ENDPOINT="https://hf-mirror.com"
8 | 
9 | # Set the working directory
10 | WORKDIR /app
11 | 
12 | # Copy requirements.txt into the container
13 | COPY requirements.txt .
14 | 
15 | # Install dependencies
16 | RUN pip install --upgrade -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple pip && \
17 |     pip install --no-cache-dir -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple -r requirements.txt
18 | 
19 | # Copy the application code into the container
20 | COPY . .
21 | 
22 | # Expose the application's service port
23 | EXPOSE 20926
24 | 
25 | # Define the startup command
26 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "20926"]
27 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | This project is licensed under the GNU Affero General Public License v3.0 (AGPL-3.0).
2 | Portions of this software are derived from the following projects; see NOTICE for details.
3 | - [MarkItDown](https://github.com/microsoft/markitdown) under the MIT License.
4 | - [MinerU](https://github.com/opendatalab/MinerU) under the AGPL License.
5 | 
6 | 
7 | 
8 |                     GNU AFFERO GENERAL PUBLIC LICENSE
9 |                        Version 3, 19 November 2007
10 | 
11 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
12 |  Everyone is permitted to copy and distribute verbatim copies
13 |  of this license document, but changing it is not allowed.
14 | 
15 |                             Preamble
16 | 
17 |   The GNU Affero General Public License is a free, copyleft license for
18 | software and other kinds of works, specifically designed to ensure
19 | cooperation with the community in the case of network server software.
20 | 
21 |   The licenses for most software and other practical works are designed
22 | to take away your freedom to share and change the works.  By contrast,
23 | our General Public Licenses are intended to guarantee your freedom to
24 | share and change all versions of a program--to make sure it remains free
25 | software for all its users.
26 | 
27 |   When we speak of free software, we are referring to freedom, not
28 | price.  Our General Public Licenses are designed to make sure that you
29 | have the freedom to distribute copies of free software (and charge for
30 | them if you wish), that you receive source code or can get it if you
31 | want it, that you can change the software or use pieces of it in new
32 | free programs, and that you know you can do these things.
33 | 
34 |   Developers that use our General Public Licenses protect your rights
35 | with two steps: (1) assert copyright on the software, and (2) offer
36 | you this License which gives you legal permission to copy, distribute
37 | and/or modify the software.
38 | 
39 |   A secondary benefit of defending all users' freedom is that
40 | improvements made in alternate versions of the program, if they
41 | receive widespread use, become available for other developers to
42 | incorporate.  Many developers of free software are heartened and
43 | encouraged by the resulting cooperation.
However, in the case of 44 | software used on network servers, this result may fail to come about. 45 | The GNU General Public License permits making a modified version and 46 | letting the public access it on a server without ever releasing its 47 | source code to the public. 48 | 49 | The GNU Affero General Public License is designed specifically to 50 | ensure that, in such cases, the modified source code becomes available 51 | to the community. It requires the operator of a network server to 52 | provide the source code of the modified version running there to the 53 | users of that server. Therefore, public use of a modified version, on 54 | a publicly accessible server, gives the public access to the source 55 | code of the modified version. 56 | 57 | An older license, called the Affero General Public License and 58 | published by Affero, was designed to accomplish similar goals. This is 59 | a different license, not a version of the Affero GPL, but Affero has 60 | released a new version of the Affero GPL which permits relicensing under 61 | this license. 62 | 63 | The precise terms and conditions for copying, distribution and 64 | modification follow. 65 | 66 | TERMS AND CONDITIONS 67 | 68 | 0. Definitions. 69 | 70 | "This License" refers to version 3 of the GNU Affero General Public License. 71 | 72 | "Copyright" also means copyright-like laws that apply to other kinds of 73 | works, such as semiconductor masks. 74 | 75 | "The Program" refers to any copyrightable work licensed under this 76 | License. Each licensee is addressed as "you". "Licensees" and 77 | "recipients" may be individuals or organizations. 78 | 79 | To "modify" a work means to copy from or adapt all or part of the work 80 | in a fashion requiring copyright permission, other than the making of an 81 | exact copy. The resulting work is called a "modified version" of the 82 | earlier work or a work "based on" the earlier work. 83 | 84 | A "covered work" means either the unmodified Program or a work based 85 | on the Program. 86 | 87 | To "propagate" a work means to do anything with it that, without 88 | permission, would make you directly or secondarily liable for 89 | infringement under applicable copyright law, except executing it on a 90 | computer or modifying a private copy. Propagation includes copying, 91 | distribution (with or without modification), making available to the 92 | public, and in some countries other activities as well. 93 | 94 | To "convey" a work means any kind of propagation that enables other 95 | parties to make or receive copies. Mere interaction with a user through 96 | a computer network, with no transfer of a copy, is not conveying. 97 | 98 | An interactive user interface displays "Appropriate Legal Notices" 99 | to the extent that it includes a convenient and prominently visible 100 | feature that (1) displays an appropriate copyright notice, and (2) 101 | tells the user that there is no warranty for the work (except to the 102 | extent that warranties are provided), that licensees may convey the 103 | work under this License, and how to view a copy of this License. If 104 | the interface presents a list of user commands or options, such as a 105 | menu, a prominent item in the list meets this criterion. 106 | 107 | 1. Source Code. 108 | 109 | The "source code" for a work means the preferred form of the work 110 | for making modifications to it. "Object code" means any non-source 111 | form of a work. 
112 | 113 | A "Standard Interface" means an interface that either is an official 114 | standard defined by a recognized standards body, or, in the case of 115 | interfaces specified for a particular programming language, one that 116 | is widely used among developers working in that language. 117 | 118 | The "System Libraries" of an executable work include anything, other 119 | than the work as a whole, that (a) is included in the normal form of 120 | packaging a Major Component, but which is not part of that Major 121 | Component, and (b) serves only to enable use of the work with that 122 | Major Component, or to implement a Standard Interface for which an 123 | implementation is available to the public in source code form. A 124 | "Major Component", in this context, means a major essential component 125 | (kernel, window system, and so on) of the specific operating system 126 | (if any) on which the executable work runs, or a compiler used to 127 | produce the work, or an object code interpreter used to run it. 128 | 129 | The "Corresponding Source" for a work in object code form means all 130 | the source code needed to generate, install, and (for an executable 131 | work) run the object code and to modify the work, including scripts to 132 | control those activities. However, it does not include the work's 133 | System Libraries, or general-purpose tools or generally available free 134 | programs which are used unmodified in performing those activities but 135 | which are not part of the work. For example, Corresponding Source 136 | includes interface definition files associated with source files for 137 | the work, and the source code for shared libraries and dynamically 138 | linked subprograms that the work is specifically designed to require, 139 | such as by intimate data communication or control flow between those 140 | subprograms and other parts of the work. 141 | 142 | The Corresponding Source need not include anything that users 143 | can regenerate automatically from other parts of the Corresponding 144 | Source. 145 | 146 | The Corresponding Source for a work in source code form is that 147 | same work. 148 | 149 | 2. Basic Permissions. 150 | 151 | All rights granted under this License are granted for the term of 152 | copyright on the Program, and are irrevocable provided the stated 153 | conditions are met. This License explicitly affirms your unlimited 154 | permission to run the unmodified Program. The output from running a 155 | covered work is covered by this License only if the output, given its 156 | content, constitutes a covered work. This License acknowledges your 157 | rights of fair use or other equivalent, as provided by copyright law. 158 | 159 | You may make, run and propagate covered works that you do not 160 | convey, without conditions so long as your license otherwise remains 161 | in force. You may convey covered works to others for the sole purpose 162 | of having them make modifications exclusively for you, or provide you 163 | with facilities for running those works, provided that you comply with 164 | the terms of this License in conveying all material for which you do 165 | not control copyright. Those thus making or running the covered works 166 | for you must do so exclusively on your behalf, under your direction 167 | and control, on terms that prohibit them from making any copies of 168 | your copyrighted material outside their relationship with you. 
169 | 170 | Conveying under any other circumstances is permitted solely under 171 | the conditions stated below. Sublicensing is not allowed; section 10 172 | makes it unnecessary. 173 | 174 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 175 | 176 | No covered work shall be deemed part of an effective technological 177 | measure under any applicable law fulfilling obligations under article 178 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 179 | similar laws prohibiting or restricting circumvention of such 180 | measures. 181 | 182 | When you convey a covered work, you waive any legal power to forbid 183 | circumvention of technological measures to the extent such circumvention 184 | is effected by exercising rights under this License with respect to 185 | the covered work, and you disclaim any intention to limit operation or 186 | modification of the work as a means of enforcing, against the work's 187 | users, your or third parties' legal rights to forbid circumvention of 188 | technological measures. 189 | 190 | 4. Conveying Verbatim Copies. 191 | 192 | You may convey verbatim copies of the Program's source code as you 193 | receive it, in any medium, provided that you conspicuously and 194 | appropriately publish on each copy an appropriate copyright notice; 195 | keep intact all notices stating that this License and any 196 | non-permissive terms added in accord with section 7 apply to the code; 197 | keep intact all notices of the absence of any warranty; and give all 198 | recipients a copy of this License along with the Program. 199 | 200 | You may charge any price or no price for each copy that you convey, 201 | and you may offer support or warranty protection for a fee. 202 | 203 | 5. Conveying Modified Source Versions. 204 | 205 | You may convey a work based on the Program, or the modifications to 206 | produce it from the Program, in the form of source code under the 207 | terms of section 4, provided that you also meet all of these conditions: 208 | 209 | a) The work must carry prominent notices stating that you modified 210 | it, and giving a relevant date. 211 | 212 | b) The work must carry prominent notices stating that it is 213 | released under this License and any conditions added under section 214 | 7. This requirement modifies the requirement in section 4 to 215 | "keep intact all notices". 216 | 217 | c) You must license the entire work, as a whole, under this 218 | License to anyone who comes into possession of a copy. This 219 | License will therefore apply, along with any applicable section 7 220 | additional terms, to the whole of the work, and all its parts, 221 | regardless of how they are packaged. This License gives no 222 | permission to license the work in any other way, but it does not 223 | invalidate such permission if you have separately received it. 224 | 225 | d) If the work has interactive user interfaces, each must display 226 | Appropriate Legal Notices; however, if the Program has interactive 227 | interfaces that do not display Appropriate Legal Notices, your 228 | work need not make them do so. 
229 | 230 | A compilation of a covered work with other separate and independent 231 | works, which are not by their nature extensions of the covered work, 232 | and which are not combined with it such as to form a larger program, 233 | in or on a volume of a storage or distribution medium, is called an 234 | "aggregate" if the compilation and its resulting copyright are not 235 | used to limit the access or legal rights of the compilation's users 236 | beyond what the individual works permit. Inclusion of a covered work 237 | in an aggregate does not cause this License to apply to the other 238 | parts of the aggregate. 239 | 240 | 6. Conveying Non-Source Forms. 241 | 242 | You may convey a covered work in object code form under the terms 243 | of sections 4 and 5, provided that you also convey the 244 | machine-readable Corresponding Source under the terms of this License, 245 | in one of these ways: 246 | 247 | a) Convey the object code in, or embodied in, a physical product 248 | (including a physical distribution medium), accompanied by the 249 | Corresponding Source fixed on a durable physical medium 250 | customarily used for software interchange. 251 | 252 | b) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by a 254 | written offer, valid for at least three years and valid for as 255 | long as you offer spare parts or customer support for that product 256 | model, to give anyone who possesses the object code either (1) a 257 | copy of the Corresponding Source for all the software in the 258 | product that is covered by this License, on a durable physical 259 | medium customarily used for software interchange, for a price no 260 | more than your reasonable cost of physically performing this 261 | conveying of source, or (2) access to copy the 262 | Corresponding Source from a network server at no charge. 263 | 264 | c) Convey individual copies of the object code with a copy of the 265 | written offer to provide the Corresponding Source. This 266 | alternative is allowed only occasionally and noncommercially, and 267 | only if you received the object code with such an offer, in accord 268 | with subsection 6b. 269 | 270 | d) Convey the object code by offering access from a designated 271 | place (gratis or for a charge), and offer equivalent access to the 272 | Corresponding Source in the same way through the same place at no 273 | further charge. You need not require recipients to copy the 274 | Corresponding Source along with the object code. If the place to 275 | copy the object code is a network server, the Corresponding Source 276 | may be on a different server (operated by you or a third party) 277 | that supports equivalent copying facilities, provided you maintain 278 | clear directions next to the object code saying where to find the 279 | Corresponding Source. Regardless of what server hosts the 280 | Corresponding Source, you remain obligated to ensure that it is 281 | available for as long as needed to satisfy these requirements. 282 | 283 | e) Convey the object code using peer-to-peer transmission, provided 284 | you inform other peers where the object code and Corresponding 285 | Source of the work are being offered to the general public at no 286 | charge under subsection 6d. 287 | 288 | A separable portion of the object code, whose source code is excluded 289 | from the Corresponding Source as a System Library, need not be 290 | included in conveying the object code work. 
291 | 292 | A "User Product" is either (1) a "consumer product", which means any 293 | tangible personal property which is normally used for personal, family, 294 | or household purposes, or (2) anything designed or sold for incorporation 295 | into a dwelling. In determining whether a product is a consumer product, 296 | doubtful cases shall be resolved in favor of coverage. For a particular 297 | product received by a particular user, "normally used" refers to a 298 | typical or common use of that class of product, regardless of the status 299 | of the particular user or of the way in which the particular user 300 | actually uses, or expects or is expected to use, the product. A product 301 | is a consumer product regardless of whether the product has substantial 302 | commercial, industrial or non-consumer uses, unless such uses represent 303 | the only significant mode of use of the product. 304 | 305 | "Installation Information" for a User Product means any methods, 306 | procedures, authorization keys, or other information required to install 307 | and execute modified versions of a covered work in that User Product from 308 | a modified version of its Corresponding Source. The information must 309 | suffice to ensure that the continued functioning of the modified object 310 | code is in no case prevented or interfered with solely because 311 | modification has been made. 312 | 313 | If you convey an object code work under this section in, or with, or 314 | specifically for use in, a User Product, and the conveying occurs as 315 | part of a transaction in which the right of possession and use of the 316 | User Product is transferred to the recipient in perpetuity or for a 317 | fixed term (regardless of how the transaction is characterized), the 318 | Corresponding Source conveyed under this section must be accompanied 319 | by the Installation Information. But this requirement does not apply 320 | if neither you nor any third party retains the ability to install 321 | modified object code on the User Product (for example, the work has 322 | been installed in ROM). 323 | 324 | The requirement to provide Installation Information does not include a 325 | requirement to continue to provide support service, warranty, or updates 326 | for a work that has been modified or installed by the recipient, or for 327 | the User Product in which it has been modified or installed. Access to a 328 | network may be denied when the modification itself materially and 329 | adversely affects the operation of the network or violates the rules and 330 | protocols for communication across the network. 331 | 332 | Corresponding Source conveyed, and Installation Information provided, 333 | in accord with this section must be in a format that is publicly 334 | documented (and with an implementation available to the public in 335 | source code form), and must require no special password or key for 336 | unpacking, reading or copying. 337 | 338 | 7. Additional Terms. 339 | 340 | "Additional permissions" are terms that supplement the terms of this 341 | License by making exceptions from one or more of its conditions. 342 | Additional permissions that are applicable to the entire Program shall 343 | be treated as though they were included in this License, to the extent 344 | that they are valid under applicable law. 
If additional permissions 345 | apply only to part of the Program, that part may be used separately 346 | under those permissions, but the entire Program remains governed by 347 | this License without regard to the additional permissions. 348 | 349 | When you convey a copy of a covered work, you may at your option 350 | remove any additional permissions from that copy, or from any part of 351 | it. (Additional permissions may be written to require their own 352 | removal in certain cases when you modify the work.) You may place 353 | additional permissions on material, added by you to a covered work, 354 | for which you have or can give appropriate copyright permission. 355 | 356 | Notwithstanding any other provision of this License, for material you 357 | add to a covered work, you may (if authorized by the copyright holders of 358 | that material) supplement the terms of this License with terms: 359 | 360 | a) Disclaiming warranty or limiting liability differently from the 361 | terms of sections 15 and 16 of this License; or 362 | 363 | b) Requiring preservation of specified reasonable legal notices or 364 | author attributions in that material or in the Appropriate Legal 365 | Notices displayed by works containing it; or 366 | 367 | c) Prohibiting misrepresentation of the origin of that material, or 368 | requiring that modified versions of such material be marked in 369 | reasonable ways as different from the original version; or 370 | 371 | d) Limiting the use for publicity purposes of names of licensors or 372 | authors of the material; or 373 | 374 | e) Declining to grant rights under trademark law for use of some 375 | trade names, trademarks, or service marks; or 376 | 377 | f) Requiring indemnification of licensors and authors of that 378 | material by anyone who conveys the material (or modified versions of 379 | it) with contractual assumptions of liability to the recipient, for 380 | any liability that these contractual assumptions directly impose on 381 | those licensors and authors. 382 | 383 | All other non-permissive additional terms are considered "further 384 | restrictions" within the meaning of section 10. If the Program as you 385 | received it, or any part of it, contains a notice stating that it is 386 | governed by this License along with a term that is a further 387 | restriction, you may remove that term. If a license document contains 388 | a further restriction but permits relicensing or conveying under this 389 | License, you may add to a covered work material governed by the terms 390 | of that license document, provided that the further restriction does 391 | not survive such relicensing or conveying. 392 | 393 | If you add terms to a covered work in accord with this section, you 394 | must place, in the relevant source files, a statement of the 395 | additional terms that apply to those files, or a notice indicating 396 | where to find the applicable terms. 397 | 398 | Additional terms, permissive or non-permissive, may be stated in the 399 | form of a separately written license, or stated as exceptions; 400 | the above requirements apply either way. 401 | 402 | 8. Termination. 403 | 404 | You may not propagate or modify a covered work except as expressly 405 | provided under this License. Any attempt otherwise to propagate or 406 | modify it is void, and will automatically terminate your rights under 407 | this License (including any patent licenses granted under the third 408 | paragraph of section 11). 
409 | 410 | However, if you cease all violation of this License, then your 411 | license from a particular copyright holder is reinstated (a) 412 | provisionally, unless and until the copyright holder explicitly and 413 | finally terminates your license, and (b) permanently, if the copyright 414 | holder fails to notify you of the violation by some reasonable means 415 | prior to 60 days after the cessation. 416 | 417 | Moreover, your license from a particular copyright holder is 418 | reinstated permanently if the copyright holder notifies you of the 419 | violation by some reasonable means, this is the first time you have 420 | received notice of violation of this License (for any work) from that 421 | copyright holder, and you cure the violation prior to 30 days after 422 | your receipt of the notice. 423 | 424 | Termination of your rights under this section does not terminate the 425 | licenses of parties who have received copies or rights from you under 426 | this License. If your rights have been terminated and not permanently 427 | reinstated, you do not qualify to receive new licenses for the same 428 | material under section 10. 429 | 430 | 9. Acceptance Not Required for Having Copies. 431 | 432 | You are not required to accept this License in order to receive or 433 | run a copy of the Program. Ancillary propagation of a covered work 434 | occurring solely as a consequence of using peer-to-peer transmission 435 | to receive a copy likewise does not require acceptance. However, 436 | nothing other than this License grants you permission to propagate or 437 | modify any covered work. These actions infringe copyright if you do 438 | not accept this License. Therefore, by modifying or propagating a 439 | covered work, you indicate your acceptance of this License to do so. 440 | 441 | 10. Automatic Licensing of Downstream Recipients. 442 | 443 | Each time you convey a covered work, the recipient automatically 444 | receives a license from the original licensors, to run, modify and 445 | propagate that work, subject to this License. You are not responsible 446 | for enforcing compliance by third parties with this License. 447 | 448 | An "entity transaction" is a transaction transferring control of an 449 | organization, or substantially all assets of one, or subdividing an 450 | organization, or merging organizations. If propagation of a covered 451 | work results from an entity transaction, each party to that 452 | transaction who receives a copy of the work also receives whatever 453 | licenses to the work the party's predecessor in interest had or could 454 | give under the previous paragraph, plus a right to possession of the 455 | Corresponding Source of the work from the predecessor in interest, if 456 | the predecessor has it or can get it with reasonable efforts. 457 | 458 | You may not impose any further restrictions on the exercise of the 459 | rights granted or affirmed under this License. For example, you may 460 | not impose a license fee, royalty, or other charge for exercise of 461 | rights granted under this License, and you may not initiate litigation 462 | (including a cross-claim or counterclaim in a lawsuit) alleging that 463 | any patent claim is infringed by making, using, selling, offering for 464 | sale, or importing the Program or any portion of it. 465 | 466 | 11. Patents. 467 | 468 | A "contributor" is a copyright holder who authorizes use under this 469 | License of the Program or a work on which the Program is based. 
The 470 | work thus licensed is called the contributor's "contributor version". 471 | 472 | A contributor's "essential patent claims" are all patent claims 473 | owned or controlled by the contributor, whether already acquired or 474 | hereafter acquired, that would be infringed by some manner, permitted 475 | by this License, of making, using, or selling its contributor version, 476 | but do not include claims that would be infringed only as a 477 | consequence of further modification of the contributor version. For 478 | purposes of this definition, "control" includes the right to grant 479 | patent sublicenses in a manner consistent with the requirements of 480 | this License. 481 | 482 | Each contributor grants you a non-exclusive, worldwide, royalty-free 483 | patent license under the contributor's essential patent claims, to 484 | make, use, sell, offer for sale, import and otherwise run, modify and 485 | propagate the contents of its contributor version. 486 | 487 | In the following three paragraphs, a "patent license" is any express 488 | agreement or commitment, however denominated, not to enforce a patent 489 | (such as an express permission to practice a patent or covenant not to 490 | sue for patent infringement). To "grant" such a patent license to a 491 | party means to make such an agreement or commitment not to enforce a 492 | patent against the party. 493 | 494 | If you convey a covered work, knowingly relying on a patent license, 495 | and the Corresponding Source of the work is not available for anyone 496 | to copy, free of charge and under the terms of this License, through a 497 | publicly available network server or other readily accessible means, 498 | then you must either (1) cause the Corresponding Source to be so 499 | available, or (2) arrange to deprive yourself of the benefit of the 500 | patent license for this particular work, or (3) arrange, in a manner 501 | consistent with the requirements of this License, to extend the patent 502 | license to downstream recipients. "Knowingly relying" means you have 503 | actual knowledge that, but for the patent license, your conveying the 504 | covered work in a country, or your recipient's use of the covered work 505 | in a country, would infringe one or more identifiable patents in that 506 | country that you have reason to believe are valid. 507 | 508 | If, pursuant to or in connection with a single transaction or 509 | arrangement, you convey, or propagate by procuring conveyance of, a 510 | covered work, and grant a patent license to some of the parties 511 | receiving the covered work authorizing them to use, propagate, modify 512 | or convey a specific copy of the covered work, then the patent license 513 | you grant is automatically extended to all recipients of the covered 514 | work and works based on it. 515 | 516 | A patent license is "discriminatory" if it does not include within 517 | the scope of its coverage, prohibits the exercise of, or is 518 | conditioned on the non-exercise of one or more of the rights that are 519 | specifically granted under this License. 
You may not convey a covered 520 | work if you are a party to an arrangement with a third party that is 521 | in the business of distributing software, under which you make payment 522 | to the third party based on the extent of your activity of conveying 523 | the work, and under which the third party grants, to any of the 524 | parties who would receive the covered work from you, a discriminatory 525 | patent license (a) in connection with copies of the covered work 526 | conveyed by you (or copies made from those copies), or (b) primarily 527 | for and in connection with specific products or compilations that 528 | contain the covered work, unless you entered into that arrangement, 529 | or that patent license was granted, prior to 28 March 2007. 530 | 531 | Nothing in this License shall be construed as excluding or limiting 532 | any implied license or other defenses to infringement that may 533 | otherwise be available to you under applicable patent law. 534 | 535 | 12. No Surrender of Others' Freedom. 536 | 537 | If conditions are imposed on you (whether by court order, agreement or 538 | otherwise) that contradict the conditions of this License, they do not 539 | excuse you from the conditions of this License. If you cannot convey a 540 | covered work so as to satisfy simultaneously your obligations under this 541 | License and any other pertinent obligations, then as a consequence you may 542 | not convey it at all. For example, if you agree to terms that obligate you 543 | to collect a royalty for further conveying from those to whom you convey 544 | the Program, the only way you could satisfy both those terms and this 545 | License would be to refrain entirely from conveying the Program. 546 | 547 | 13. Remote Network Interaction; Use with the GNU General Public License. 548 | 549 | Notwithstanding any other provision of this License, if you modify the 550 | Program, your modified version must prominently offer all users 551 | interacting with it remotely through a computer network (if your version 552 | supports such interaction) an opportunity to receive the Corresponding 553 | Source of your version by providing access to the Corresponding Source 554 | from a network server at no charge, through some standard or customary 555 | means of facilitating copying of software. This Corresponding Source 556 | shall include the Corresponding Source for any work covered by version 3 557 | of the GNU General Public License that is incorporated pursuant to the 558 | following paragraph. 559 | 560 | Notwithstanding any other provision of this License, you have 561 | permission to link or combine any covered work with a work licensed 562 | under version 3 of the GNU General Public License into a single 563 | combined work, and to convey the resulting work. The terms of this 564 | License will continue to apply to the part which is the covered work, 565 | but the work with which it is combined will remain governed by version 566 | 3 of the GNU General Public License. 567 | 568 | 14. Revised Versions of this License. 569 | 570 | The Free Software Foundation may publish revised and/or new versions of 571 | the GNU Affero General Public License from time to time. Such new versions 572 | will be similar in spirit to the present version, but may differ in detail to 573 | address new problems or concerns. 574 | 575 | Each version is given a distinguishing version number. 
If the 576 | Program specifies that a certain numbered version of the GNU Affero General 577 | Public License "or any later version" applies to it, you have the 578 | option of following the terms and conditions either of that numbered 579 | version or of any later version published by the Free Software 580 | Foundation. If the Program does not specify a version number of the 581 | GNU Affero General Public License, you may choose any version ever published 582 | by the Free Software Foundation. 583 | 584 | If the Program specifies that a proxy can decide which future 585 | versions of the GNU Affero General Public License can be used, that proxy's 586 | public statement of acceptance of a version permanently authorizes you 587 | to choose that version for the Program. 588 | 589 | Later license versions may give you additional or different 590 | permissions. However, no additional obligations are imposed on any 591 | author or copyright holder as a result of your choosing to follow a 592 | later version. 593 | 594 | 15. Disclaimer of Warranty. 595 | 596 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 597 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 598 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 599 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 600 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 601 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 602 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 603 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 604 | 605 | 16. Limitation of Liability. 606 | 607 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 608 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 609 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 610 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 611 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 612 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 613 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 614 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 615 | SUCH DAMAGES. 616 | 617 | 17. Interpretation of Sections 15 and 16. 618 | 619 | If the disclaimer of warranty and limitation of liability provided 620 | above cannot be given local legal effect according to their terms, 621 | reviewing courts shall apply local law that most closely approximates 622 | an absolute waiver of all civil liability in connection with the 623 | Program, unless a warranty or assumption of liability accompanies a 624 | copy of the Program in return for a fee. 625 | 626 | END OF TERMS AND CONDITIONS 627 | 628 | How to Apply These Terms to Your New Programs 629 | 630 | If you develop a new program, and you want it to be of the greatest 631 | possible use to the public, the best way to achieve this is to make it 632 | free software which everyone can redistribute and change under these terms. 633 | 634 | To do so, attach the following notices to the program. It is safest 635 | to attach them to the start of each source file to most effectively 636 | state the exclusion of warranty; and each file should have at least 637 | the "copyright" line and a pointer to where the full notice is found. 
638 | 
639 |     <one line to give the program's name and a brief idea of what it does.>
640 |     Copyright (C) <year>  <name of author>
641 | 
642 |     This program is free software: you can redistribute it and/or modify
643 |     it under the terms of the GNU Affero General Public License as published
644 |     by the Free Software Foundation, either version 3 of the License, or
645 |     (at your option) any later version.
646 | 
647 |     This program is distributed in the hope that it will be useful,
648 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
649 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
650 |     GNU Affero General Public License for more details.
651 | 
652 |     You should have received a copy of the GNU Affero General Public License
653 |     along with this program.  If not, see <https://www.gnu.org/licenses/>.
654 | 
655 | Also add information on how to contact you by electronic and paper mail.
656 | 
657 |   If your software can interact with users remotely through a computer
658 | network, you should also make sure that it provides a way for users to
659 | get its source.  For example, if your program is a web application, its
660 | interface could display a "Source" link that leads users to an archive
661 | of the code.  There are many ways you could offer source, and different
662 | solutions will be better for different programs; see section 13 for the
663 | specific requirements.
664 | 
665 |   You should also get your employer (if you work as a programmer) or school,
666 | if any, to sign a "copyright disclaimer" for the program, if necessary.
667 | For more information on this, and how to apply and follow the GNU AGPL, see
668 | <https://www.gnu.org/licenses/>.
669 | 
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | 
2 | NOTICE
3 | 
4 | This project includes code from the following MIT-licensed project(s):
5 | 
6 | - Project Name: MarkItDown
7 | - Repository: https://github.com/microsoft/markitdown
8 | - License: MIT
9 | 
10 | MIT License
11 | 
12 | Copyright (c) Microsoft Corporation.
13 | 
14 | Permission is hereby granted, free of charge, to any person obtaining a copy
15 | of this software and associated documentation files (the "Software"), to deal
16 | in the Software without restriction, including without limitation the rights
17 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
18 | copies of the Software, and to permit persons to whom the Software is
19 | furnished to do so, subject to the following conditions:
20 | 
21 | The above copyright notice and this permission notice shall be included in all
22 | copies or substantial portions of the Software.
23 | 
24 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Markify
2 | 
3 | ✨ **Effortlessly convert files to Markdown, helping RAG pipelines and LLMs understand your content more intelligently!** ✨
4 | 
5 | 🚀 **Built on MarkItDown and MinerU**, Markify supports conversion of many formats and provides **high-quality PDF parsing**, making your documents easier to process and use!
6 | 
7 | 📡 **Available through an API and a Streamlit client**, so you can convert efficiently anywhere and integrate with ease!
8 | 
9 | 📂 **Supported file formats**:
10 | - 📄 **Documents**: PDF, Word, PPT, Excel
11 | - 🖼 **Multimedia**: images, audio
12 | - 🌐 **Web & data**: HTML, CSV, JSON, XML
13 | - 🗂 **Archives**: ZIP
14 | 
15 | ⚡ **Multiple PDF parsing modes to fit different needs**:
16 | - 🚀 **Fast mode** (based on pdfminer, efficient parsing)
17 | - 🏆 **Advanced mode** (deep parsing with MinerU for better results)
18 | - ☁️ **Cloud mode** (in development, stay tuned!)
19 | 
20 | 📖 **Markdown-ify your files so LLMs can better understand and process your documents!** 💡
21 | 
22 | ![Streamlit UI](assets/streamlint_ui.png)
23 | ```shell
24 | streamlit run ./client/streamlit_client.py
25 | ```
26 | 
27 | ## API
28 | FastAPI ships with interactive API docs at http://127.0.0.1:20926/docs
29 | ### Upload a file and create a job
30 | Request
31 | ```shell
32 | curl -X 'POST' \
33 |   'http://127.0.0.1:20926/api/jobs' \
34 |   -H 'accept: application/json' \
35 |   -H 'Content-Type: multipart/form-data' \
36 |   -F 'file=@CoA.pdf;type=application/pdf' \
37 |   -F 'mode=advanced'
38 | ```
39 | Response
40 | ```json
41 | {
42 |   "job_id": "29bbad6b-c167-41f0-8a29-99551c499263"
43 | }
44 | ```
45 | ### Query the job status
46 | Request
47 | ```shell
48 | curl -X 'GET' \
49 |   'http://127.0.0.1:20926/api/jobs/29bbad6b-c167-41f0-8a29-99551c499263' \
50 |   -H 'accept: application/json'
51 | ```
52 | Response
53 | ```json
54 | {
55 |   "job_id": "29bbad6b-c167-41f0-8a29-99551c499263",
56 |   "status": "completed",
57 |   "filename": "CoA.pdf",
58 |   "params": {
59 |     "mode": "advanced"
60 |   },
61 |   "error": null
62 | }
63 | ```
64 | ### Download the Markdown file
65 | Request
66 | ```shell
67 | curl -X 'GET' \
68 |   'http://127.0.0.1:20926/api/jobs/29bbad6b-c167-41f0-8a29-99551c499263/result' \
69 |   -H 'accept: application/json'
70 | ```
71 | Response
72 | The converted Markdown file is returned.
73 | 
74 | 
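### Python example
The three calls above can be chained from a script. The sketch below is illustrative only: the endpoint paths, `mode` values, and response fields (`job_id`, `status`, `error`) are taken from the examples above, while the helper name, polling interval, and output path are arbitrary choices.
```python
import time
import requests

BASE_URL = "http://127.0.0.1:20926"

def convert(path: str, mode: str = "advanced", poll_seconds: float = 2.0) -> str:
    # 1. Upload the file and create a conversion job
    with open(path, "rb") as f:
        resp = requests.post(f"{BASE_URL}/api/jobs", files={"file": f}, data={"mode": mode})
    resp.raise_for_status()
    job_id = resp.json()["job_id"]

    # 2. Poll the job status until it completes or fails
    while True:
        job = requests.get(f"{BASE_URL}/api/jobs/{job_id}").json()
        if job["status"] == "completed":
            break
        if job["status"] == "failed":
            raise RuntimeError(f"Conversion failed: {job['error']}")
        time.sleep(poll_seconds)

    # 3. Download the resulting Markdown next to the input file
    result = requests.get(f"{BASE_URL}/api/jobs/{job_id}/result")
    result.raise_for_status()
    out_path = path + ".md"
    with open(out_path, "wb") as out:
        out.write(result.content)
    return out_path

# Example: markdown_path = convert("CoA.pdf", mode="advanced")
```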
75 | ## Docker deployment
76 | ```shell
77 | docker pull wsjcuhk/markify:0.0.1
78 | docker run -d -p 20926:20926 wsjcuhk/markify:0.0.1
79 | ```
80 | 
81 | 
82 | ## TODO
83 | - Add a cloud parsing mode
84 | - Automatically package the project as a Docker image
85 | 
86 | ## Sponsor me
87 | Open source is hard work. If you need professional guidance or would like to sponsor the project, you can join my Zhishi Xingqiu (Knowledge Planet), where I provide professional technical guidance.
88 | ![zsxq](assets/zsxq.JPG)
89 | 
90 | 
91 | ## Acknowledgements
92 | This project draws on Microsoft's markitdown and Shanghai AI Lab's MinerU.
93 | - [markitdown](https://github.com/microsoft/markitdown)
94 | - [mineru](https://github.com/opendatalab/MinerU)
95 | 
--------------------------------------------------------------------------------
/assets/magic-pdf-template.json:
--------------------------------------------------------------------------------
1 | {
2 |     "bucket_info":{
3 |         "bucket-name-1":["ak", "sk", "endpoint"],
4 |         "bucket-name-2":["ak", "sk", "endpoint"]
5 |     },
6 |     "models-dir":"/tmp/models",
7 |     "layoutreader-model-dir":"/tmp/layoutreader",
8 |     "device-mode":"cpu",
9 |     "layout-config": {
10 |         "model": "doclayout_yolo"
11 |     },
12 |     "formula-config": {
13 |         "mfd_model": "yolo_v8_mfd",
14 |         "mfr_model": "unimernet_small",
15 |         "enable": true
16 |     },
17 |     "table-config": {
18 |         "model": "rapid_table",
19 |         "sub_model": "slanet_plus",
20 |         "enable": true,
21 |         "max_time": 400
22 |     },
23 |     "llm-aided-config": {
24 |         "formula_aided": {
25 |             "api_key": "your_api_key",
26 |             "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
27 |             "model": "qwen2.5-7b-instruct",
28 |             "enable": false
29 |         },
30 |         "text_aided": {
31 |             "api_key": "your_api_key",
32 |             "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
33 |             "model": "qwen2.5-7b-instruct",
34 |             "enable": false
35 |         },
36 |         "title_aided": {
37 |             "api_key": "your_api_key",
38 |             "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
39 |             "model": "qwen2.5-32b-instruct",
40 |             "enable": false
41 |         }
42 |     },
43 |     "config_version": "1.1.1"
44 | }
--------------------------------------------------------------------------------
/assets/sponsor.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinMountain/markify/622070781f1cdb74506b3ec1ec7d1749c1ef458c/assets/sponsor.JPG
--------------------------------------------------------------------------------
/assets/streamlint_ui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinMountain/markify/622070781f1cdb74506b3ec1ec7d1749c1ef458c/assets/streamlint_ui.png
--------------------------------------------------------------------------------
/assets/zsxq.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinMountain/markify/622070781f1cdb74506b3ec1ec7d1749c1ef458c/assets/zsxq.JPG
--------------------------------------------------------------------------------
/client/streamlit_client.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import requests
3 | import time
4 | import os
5 | 
6 | # ============ Configuration ============
7 | BASE_URL = "http://localhost:20926"
8 | 
9 | 
10 | # ============ Utility functions ============
11 | 
12 | def fetch_jobs(page=0, limit=10):
13 |     """
14 |     Fetch the latest job list from the backend /api/jobs endpoint.
15 |     (Make sure the backend implements the ?page=...&limit=... pagination parameters.)
16 |     """
17 |     url = f"{BASE_URL}/api/jobs?page={page}&limit={limit}"
18 |     try:
19 |         resp = requests.get(url)
20 |         if resp.status_code == 200:
21 |             return resp.json()  # the backend should return a list of jobs
22 |         else:
23 |             st.error(f"Failed to fetch the job list: {resp.text}")
24 |             return []
25 |     except requests.RequestException as e:
26 |         st.error(f"Network error: {e}")
27 |         return []
28 | 
29 | 
30 | def upload_file(file, mode):
31 |     """
32 |     Upload a single file to the backend to create a job.
33 |     On success, refresh the page immediately to fetch the latest job list.
34 |     """
35 |     files = {"file": file}
36 |     data = {"mode": mode}
37 |     try:
38 |         response = requests.post(f"{BASE_URL}/api/jobs", files=files, data=data)
39 |         if response.status_code == 202:
40 |             st.success(f"File `{file.name}` uploaded successfully and added to the job queue.")
41 |             st.experimental_rerun()  # trigger a page refresh so fetch_jobs() runs again
42 |         else:
43 |             st.error(f"File `{file.name}` upload failed: {response.text}")
44 |     except requests.RequestException as e:
45 |         st.error(f"Network error: {e}")
46 | 
47 | 
48 | def upload_url(url, mode):
49 |     """
50 |     Submit a single URL to the backend to create a job.
51 |     On success, refresh the page immediately to fetch the latest job list.
52 |     """
53 |     data = {"url": url, "mode": mode}
54 |     try:
55 |         response = requests.post(f"{BASE_URL}/api/jobs/url", json=data)
56 |         if response.status_code == 202:
57 |             st.success(f"URL `{url}` submitted successfully and added to the job queue.")
58 |             st.experimental_rerun()
59 |         else:
60 |             st.error(f"URL `{url}` submission failed: {response.text}")
61 |     except requests.RequestException as e:
62 |         st.error(f"Network error: {e}")
63 | 
64 | 
65 | def show_file_entry(job):
66 |     """
67 |     Render a single job entry in the file list on the right.
68 |     Example job structure returned by the backend (JSON):
69 |     {
70 |         "job_id": "xxx",
71 |         "status": "completed",
72 |         "filename": "test.pdf",
73 |         "params": {"mode": "simple"},
74 |         "error": null,
75 |         "created_at": "2025-02-25T10:00:00"
76 |     }
77 |     Adjust this to match the fields your backend actually returns.
78 |     """
79 |     col1, col2, col3, col4 = st.columns([3, 2, 2, 1])
80 | 
81 |     with col1:
82 |         st.markdown(f"**{job['filename']}**")
83 | 
84 |     # display created_at if the backend returned it
85 |     with col2:
86 |         created_time = job.get("created_at", "")
87 |         st.markdown(f"{created_time}")
88 | 
89 |     with col3:
90 |         status = job["status"]
91 |         if status == "completed":
92 |             status_icon = "✅"
93 |         elif status == "failed":
94 |             status_icon = "❌"
95 |         else:
96 |             status_icon = "⏳"
97 |         st.markdown(f"{status_icon} {status}")
98 | 
99 |     with col4:
100 |         # if the job is completed, offer a download
101 |         if status == "completed":
102 |             try:
103 |                 result_response = requests.get(f"{BASE_URL}/api/jobs/{job['job_id']}/result")
requests.get(f"{BASE_URL}/api/jobs/{job['job_id']}/result") 104 | if result_response.status_code == 200: 105 | st.download_button( 106 | label="下载", 107 | data=result_response.content, 108 | file_name=f"{job['filename']}.md", 109 | mime="text/markdown", 110 | key=f"download_{job['job_id']}" # 添加唯一 key 111 | ) 112 | else: 113 | st.error("无法下载") 114 | except requests.RequestException as e: 115 | st.error(f"下载异常:{e}") 116 | 117 | 118 | # ============ 主函数 ============ 119 | 120 | def main(): 121 | st.set_page_config(page_title="Markify", layout="wide") 122 | 123 | # 页面标题与说明 124 | st.title("Markify - 文档处理") 125 | st.markdown("在左侧上传文件或提交 URL,右侧实时查看进度并下载结果。") 126 | 127 | # 布局:左侧上传,右侧列表 128 | left_col, right_col = st.columns([2, 3], gap="large") 129 | 130 | with left_col: 131 | st.subheader("上传设置") 132 | mode = st.selectbox("选择 PDF 处理模式", ["simple", "advanced", "cloud"]) 133 | 134 | # 本地文件上传 135 | uploaded_files = st.file_uploader( 136 | "选择文件(任意类型)", 137 | type=None, 138 | accept_multiple_files=True 139 | ) 140 | if uploaded_files and st.button("上传文件"): 141 | for file in uploaded_files: 142 | upload_file(file, mode) 143 | 144 | # URL 上传 145 | st.subheader("URL 上传") 146 | file_urls = st.text_area("请输入文件 URL(每行一个)") 147 | if file_urls and st.button("提交 URL"): 148 | for url in file_urls.strip().split("\n"): 149 | if url: 150 | upload_url(url.strip(), mode) 151 | 152 | # 结果存储位置(仅作提示) 153 | st.markdown(f"**解析结果存储路径**:`{os.path.expanduser('~')}/MinerU`") 154 | 155 | with right_col: 156 | st.subheader("文件列表") 157 | 158 | # 手动刷新按钮 159 | if st.button("刷新列表"): 160 | st.experimental_rerun() 161 | 162 | # 从后端获取任务列表 163 | jobs = fetch_jobs(page=0, limit=10) 164 | if not jobs: 165 | st.info("暂无任务,请上传后查看。") 166 | else: 167 | for job in jobs: 168 | show_file_entry(job) 169 | 170 | 171 | if __name__ == "__main__": 172 | main() 173 | -------------------------------------------------------------------------------- /core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KylinMountain/markify/622070781f1cdb74506b3ec1ec7d1749c1ef458c/core/__init__.py -------------------------------------------------------------------------------- /core/base.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Union 2 | 3 | 4 | class DocumentConverterResult: 5 | """The result of converting a document to text.""" 6 | 7 | def __init__(self, title: Union[str, None] = None, text_content: str = ""): 8 | self.title: Union[str, None] = title 9 | self.text_content: str = text_content 10 | 11 | 12 | class DocumentConverter: 13 | """Abstract superclass of all DocumentConverters.""" 14 | 15 | def convert( 16 | self, local_path: str, **kwargs: Any 17 | ) -> Union[None, DocumentConverterResult]: 18 | raise NotImplementedError() 19 | 20 | 21 | class FileConversionException(BaseException): 22 | pass 23 | 24 | 25 | class UnsupportedFormatException(BaseException): 26 | pass 27 | -------------------------------------------------------------------------------- /core/converters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KylinMountain/markify/622070781f1cdb74506b3ec1ec7d1749c1ef458c/core/converters/__init__.py -------------------------------------------------------------------------------- /core/converters/bingsearch.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import binascii 
3 | import re 4 | from typing import Union 5 | from urllib.parse import parse_qs, urlparse 6 | 7 | from bs4 import BeautifulSoup 8 | 9 | from core.base import DocumentConverter, DocumentConverterResult 10 | from core.converters.custommarkdownify import _CustomMarkdownify 11 | 12 | 13 | class BingSerpConverter(DocumentConverter): 14 | """ 15 | Handle Bing results pages (only the organic search results). 16 | NOTE: It is better to use the Bing API 17 | """ 18 | 19 | def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: 20 | # Bail if not a Bing SERP 21 | extension = kwargs.get("file_extension", "") 22 | if extension.lower() not in [".html", ".htm"]: 23 | return None 24 | url = kwargs.get("url", "") 25 | if not re.search(r"^https://www\.bing\.com/search\?q=", url): 26 | return None 27 | 28 | # Parse the query parameters 29 | parsed_params = parse_qs(urlparse(url).query) 30 | query = parsed_params.get("q", [""])[0] 31 | 32 | # Parse the file 33 | soup = None 34 | with open(local_path, "rt", encoding="utf-8") as fh: 35 | soup = BeautifulSoup(fh.read(), "html.parser") 36 | 37 | # Clean up some formatting 38 | for tptt in soup.find_all(class_="tptt"): 39 | if hasattr(tptt, "string") and tptt.string: 40 | tptt.string += " " 41 | for slug in soup.find_all(class_="algoSlug_icon"): 42 | slug.extract() 43 | 44 | # Parse the algorithmic results 45 | _markdownify = _CustomMarkdownify() 46 | results = list() 47 | for result in soup.find_all(class_="b_algo"): 48 | # Rewrite redirect urls 49 | for a in result.find_all("a", href=True): 50 | parsed_href = urlparse(a["href"]) 51 | qs = parse_qs(parsed_href.query) 52 | 53 | # The destination is contained in the u parameter, 54 | # but appears to be base64 encoded, with some prefix 55 | if "u" in qs: 56 | u = ( 57 | qs["u"][0][2:].strip() + "==" 58 | ) # Python 3 doesn't care about extra padding 59 | 60 | try: 61 | # RFC 4648 / Base64URL" variant, which uses "-" and "_" 62 | a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8") 63 | except UnicodeDecodeError: 64 | pass 65 | except binascii.Error: 66 | pass 67 | 68 | # Convert to markdown 69 | md_result = _markdownify.convert_soup(result).strip() 70 | lines = [line.strip() for line in re.split(r"\n+", md_result)] 71 | results.append("\n".join([line for line in lines if len(line) > 0])) 72 | 73 | webpage_text = ( 74 | f"## A Bing search for '{query}' found the following results:\n\n" 75 | + "\n\n".join(results) 76 | ) 77 | 78 | return DocumentConverterResult( 79 | title=None if soup.title is None else soup.title.string, 80 | text_content=webpage_text, 81 | ) 82 | -------------------------------------------------------------------------------- /core/converters/custommarkdownify.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Any 3 | from urllib.parse import urlparse, urlunparse, quote, unquote 4 | 5 | import markdownify 6 | 7 | 8 | class _CustomMarkdownify(markdownify.MarkdownConverter): 9 | """ 10 | A custom version of markdownify's MarkdownConverter. Changes include: 11 | 12 | - Altering the default heading style to use '#', '##', etc. 13 | - Removing javascript hyperlinks. 14 | - Truncating images with large data:uri sources. 
15 | - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax 16 | """ 17 | 18 | def __init__(self, **options: Any): 19 | options["heading_style"] = options.get("heading_style", markdownify.ATX) 20 | # Explicitly cast options to the expected type if necessary 21 | super().__init__(**options) 22 | 23 | def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str: 24 | """Same as usual, but be sure to start with a new line""" 25 | if not convert_as_inline: 26 | if not re.search(r"^\n", text): 27 | return "\n" + super().convert_hn(n, el, text, convert_as_inline) # type: ignore 28 | 29 | return super().convert_hn(n, el, text, convert_as_inline) # type: ignore 30 | 31 | def convert_a(self, el: Any, text: str, convert_as_inline: bool): 32 | """Same as usual converter, but removes Javascript links and escapes URIs.""" 33 | prefix, suffix, text = markdownify.chomp(text) # type: ignore 34 | if not text: 35 | return "" 36 | href = el.get("href") 37 | title = el.get("title") 38 | 39 | # Escape URIs and skip non-http or file schemes 40 | if href: 41 | try: 42 | parsed_url = urlparse(href) # type: ignore 43 | if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore 44 | return "%s%s%s" % (prefix, text, suffix) 45 | href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore 46 | except ValueError: # It's not clear if this ever gets thrown 47 | return "%s%s%s" % (prefix, text, suffix) 48 | 49 | # For the replacement see #29: text nodes underscores are escaped 50 | if ( 51 | self.options["autolinks"] 52 | and text.replace(r"\_", "_") == href 53 | and not title 54 | and not self.options["default_title"] 55 | ): 56 | # Shortcut syntax 57 | return "<%s>" % href 58 | if self.options["default_title"] and not title: 59 | title = href 60 | title_part = ' "%s"' % title.replace('"', r"\"") if title else "" 61 | return ( 62 | "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix) 63 | if href 64 | else text 65 | ) 66 | 67 | def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str: 68 | """Same as usual converter, but removes data URIs""" 69 | 70 | alt = el.attrs.get("alt", None) or "" 71 | src = el.attrs.get("src", None) or "" 72 | title = el.attrs.get("title", None) or "" 73 | title_part = ' "%s"' % title.replace('"', r"\"") if title else "" 74 | if ( 75 | convert_as_inline 76 | and el.parent.name not in self.options["keep_inline_images_in"] 77 | ): 78 | return alt 79 | 80 | # Remove dataURIs 81 | if src.startswith("data:"): 82 | src = src.split(",")[0] + "..." 83 | 84 | return "![%s](%s%s)" % (alt, src, title_part) 85 | 86 | def convert_soup(self, soup: Any) -> str: 87 | return super().convert_soup(soup) # type: ignore 88 | -------------------------------------------------------------------------------- /core/converters/docx.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import mammoth 4 | 5 | from core.base import DocumentConverterResult 6 | from core.converters.html import HtmlConverter 7 | 8 | 9 | class DocxConverter(HtmlConverter): 10 | """ 11 | Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. 
12 | """ 13 | 14 | def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: 15 | # Bail if not a DOCX 16 | extension = kwargs.get("file_extension", "") 17 | if extension.lower() != ".docx": 18 | return None 19 | 20 | result = None 21 | with open(local_path, "rb") as docx_file: 22 | style_map = kwargs.get("style_map", None) 23 | 24 | result = mammoth.convert_to_html(docx_file, style_map=style_map) 25 | html_content = result.value 26 | result = self._convert(html_content) 27 | 28 | return result 29 | -------------------------------------------------------------------------------- /core/converters/html.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Union 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from core.base import DocumentConverter, DocumentConverterResult 6 | from core.converters.custommarkdownify import _CustomMarkdownify 7 | 8 | 9 | class HtmlConverter(DocumentConverter): 10 | """Anything with content type text/html""" 11 | 12 | def convert( 13 | self, local_path: str, **kwargs: Any 14 | ) -> Union[None, DocumentConverterResult]: 15 | # Bail if not html 16 | extension = kwargs.get("file_extension", "") 17 | if extension.lower() not in [".html", ".htm"]: 18 | return None 19 | 20 | result = None 21 | with open(local_path, "rt", encoding="utf-8") as fh: 22 | result = self._convert(fh.read()) 23 | 24 | return result 25 | 26 | def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]: 27 | """Helper function that converts and HTML string.""" 28 | 29 | # Parse the string 30 | soup = BeautifulSoup(html_content, "html.parser") 31 | 32 | # Remove javascript and style blocks 33 | for script in soup(["script", "style"]): 34 | script.extract() 35 | 36 | # Print only the main content 37 | body_elm = soup.find("body") 38 | webpage_text = "" 39 | if body_elm: 40 | webpage_text = _CustomMarkdownify().convert_soup(body_elm) 41 | else: 42 | webpage_text = _CustomMarkdownify().convert_soup(soup) 43 | 44 | assert isinstance(webpage_text, str) 45 | 46 | return DocumentConverterResult( 47 | title=None if soup.title is None else soup.title.string, 48 | text_content=webpage_text, 49 | ) 50 | -------------------------------------------------------------------------------- /core/converters/image.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import mimetypes 3 | from typing import Union 4 | 5 | from core.base import DocumentConverterResult 6 | from core.converters.media import MediaConverter 7 | 8 | 9 | class ImageConverter(MediaConverter): 10 | """ 11 | Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured). 
12 | """ 13 | 14 | def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: 15 | # Bail if not an image 16 | extension = kwargs.get("file_extension", "") 17 | if extension.lower() not in [".jpg", ".jpeg", ".png"]: 18 | return None 19 | 20 | md_content = "" 21 | 22 | # Add metadata 23 | metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) 24 | if metadata: 25 | for f in [ 26 | "ImageSize", 27 | "Title", 28 | "Caption", 29 | "Description", 30 | "Keywords", 31 | "Artist", 32 | "Author", 33 | "DateTimeOriginal", 34 | "CreateDate", 35 | "GPSPosition", 36 | ]: 37 | if f in metadata: 38 | md_content += f"{f}: {metadata[f]}\n" 39 | 40 | # Try describing the image with GPTV 41 | llm_client = kwargs.get("llm_client") 42 | llm_model = kwargs.get("llm_model") 43 | if llm_client is not None and llm_model is not None: 44 | md_content += ( 45 | "\n# Description:\n" 46 | + self._get_llm_description( 47 | local_path, 48 | extension, 49 | llm_client, 50 | llm_model, 51 | prompt=kwargs.get("llm_prompt"), 52 | ).strip() 53 | + "\n" 54 | ) 55 | else: 56 | md_content += """ 57 | Image description need set following env: 58 | 59 | - MARKIFY_LLM_API_BASE 60 | - MARKIFY_LLM_API_KEY 61 | - MARKIFY_LLM_MODE 62 | """ 63 | return DocumentConverterResult( 64 | title=None, 65 | text_content=md_content, 66 | ) 67 | 68 | def _get_llm_description(self, local_path, extension, client, model, prompt=None): 69 | if prompt is None or prompt.strip() == "": 70 | prompt = "Write a detailed caption for this image." 71 | 72 | data_uri = "" 73 | with open(local_path, "rb") as image_file: 74 | content_type, encoding = mimetypes.guess_type("_dummy" + extension) 75 | if content_type is None: 76 | content_type = "image/jpeg" 77 | image_base64 = base64.b64encode(image_file.read()).decode("utf-8") 78 | data_uri = f"data:{content_type};base64,{image_base64}" 79 | 80 | messages = [ 81 | { 82 | "role": "user", 83 | "content": [ 84 | {"type": "text", "text": prompt}, 85 | { 86 | "type": "image_url", 87 | "image_url": { 88 | "url": data_uri, 89 | }, 90 | }, 91 | ], 92 | } 93 | ] 94 | 95 | response = client.chat.completions.create(model=model, messages=messages) 96 | return response.choices[0].message.content 97 | -------------------------------------------------------------------------------- /core/converters/ipynb.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Union 3 | 4 | from core.base import DocumentConverter, DocumentConverterResult, FileConversionException 5 | 6 | 7 | class IpynbConverter(DocumentConverter): 8 | """Converts Jupyter Notebook (.ipynb) files to Markdown.""" 9 | 10 | def convert( 11 | self, local_path: str, **kwargs: Any 12 | ) -> Union[None, DocumentConverterResult]: 13 | # Bail if not ipynb 14 | extension = kwargs.get("file_extension", "") 15 | if extension.lower() != ".ipynb": 16 | return None 17 | 18 | # Parse and convert the notebook 19 | result = None 20 | with open(local_path, "rt", encoding="utf-8") as fh: 21 | notebook_content = json.load(fh) 22 | result = self._convert(notebook_content) 23 | 24 | return result 25 | 26 | def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]: 27 | """Helper function that converts notebook JSON content to Markdown.""" 28 | try: 29 | md_output = [] 30 | title = None 31 | 32 | for cell in notebook_content.get("cells", []): 33 | cell_type = cell.get("cell_type", "") 34 | source_lines = cell.get("source", []) 35 | 36 | if cell_type == 
"markdown": 37 | md_output.append("".join(source_lines)) 38 | 39 | # Extract the first # heading as title if not already found 40 | if title is None: 41 | for line in source_lines: 42 | if line.startswith("# "): 43 | title = line.lstrip("# ").strip() 44 | break 45 | 46 | elif cell_type == "code": 47 | # Code cells are wrapped in Markdown code blocks 48 | md_output.append(f"```python\n{''.join(source_lines)}\n```") 49 | elif cell_type == "raw": 50 | md_output.append(f"```\n{''.join(source_lines)}\n```") 51 | 52 | md_text = "\n\n".join(md_output) 53 | 54 | # Check for title in notebook metadata 55 | title = notebook_content.get("metadata", {}).get("title", title) 56 | 57 | return DocumentConverterResult( 58 | title=title, 59 | text_content=md_text, 60 | ) 61 | 62 | except Exception as e: 63 | raise FileConversionException( 64 | f"Error converting .ipynb file: {str(e)}" 65 | ) from e 66 | -------------------------------------------------------------------------------- /core/converters/media.py: -------------------------------------------------------------------------------- 1 | import json 2 | import shutil 3 | import subprocess 4 | from _warnings import warn 5 | 6 | from core.base import DocumentConverter 7 | 8 | 9 | class MediaConverter(DocumentConverter): 10 | """ 11 | Abstract class for multi-modal media (e.g., images and audio) 12 | """ 13 | 14 | def _get_metadata(self, local_path, exiftool_path=None): 15 | if not exiftool_path: 16 | which_exiftool = shutil.which("exiftool") 17 | if which_exiftool: 18 | warn( 19 | f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g., 20 | 21 | md = MarkItDown(exiftool_path="{which_exiftool}") 22 | 23 | This warning will be removed in future releases. 
24 | """, 25 | DeprecationWarning, 26 | ) 27 | 28 | return None 29 | else: 30 | try: 31 | result = subprocess.run( 32 | [exiftool_path, "-json", local_path], capture_output=True, text=True 33 | ).stdout 34 | return json.loads(result)[0] 35 | except Exception: 36 | return None 37 | -------------------------------------------------------------------------------- /core/converters/mineru/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KylinMountain/markify/622070781f1cdb74506b3ec1ec7d1749c1ef458c/core/converters/mineru/__init__.py -------------------------------------------------------------------------------- /core/converters/mineru/pdf_processor.py: -------------------------------------------------------------------------------- 1 | import re 2 | import urllib.parse 3 | from pathlib import Path 4 | from typing import Dict 5 | 6 | from magic_pdf.config.enums import SupportedPdfParseMethod 7 | from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader 8 | from magic_pdf.data.dataset import PymuDocDataset 9 | from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze 10 | 11 | from core.converters.mineru.title_corrector import MarkdownTitleProcessor 12 | 13 | 14 | class PDFProcessor: 15 | """PDF文档处理管道""" 16 | 17 | def __init__(self, output_dir: str = "output", base_url: str = "http://localhost:20926", **kwargs): 18 | self.output_dir = Path(output_dir) 19 | self.image_dir = self.output_dir / "images" 20 | self.base_url = base_url 21 | self._prepare_directories() 22 | 23 | def _prepare_directories(self): 24 | """创建输出目录结构""" 25 | self.image_dir.mkdir(parents=True, exist_ok=True) 26 | self.output_dir.mkdir(exist_ok=True) 27 | 28 | def process(self, pdf_path: str) -> Dict[str, str]: 29 | """处理PDF主流程""" 30 | pdf_path = Path(pdf_path) 31 | if not pdf_path.exists(): 32 | raise FileNotFoundError(f"PDF文件不存在: {pdf_path}") 33 | 34 | name_stem = pdf_path.stem 35 | writers = { 36 | 'image': FileBasedDataWriter(str(self.image_dir)), 37 | 'markdown': FileBasedDataWriter(str(self.output_dir)) 38 | } 39 | 40 | # 读取并解析PDF 41 | pdf_content = FileBasedDataReader("").read(str(pdf_path)) 42 | dataset = PymuDocDataset(pdf_content) 43 | 44 | # 执行解析流程 45 | if dataset.classify() == SupportedPdfParseMethod.OCR: 46 | result = dataset.apply(doc_analyze, ocr=True).pipe_ocr_mode(writers['image']) 47 | else: 48 | result = dataset.apply(doc_analyze, ocr=False).pipe_txt_mode(writers['image']) 49 | 50 | # 生成输出文件 51 | output_files = self._generate_outputs(result, writers, name_stem) 52 | 53 | # 自动修正标题层级 54 | self._adjust_title_levels(output_files['markdown']) 55 | 56 | self._replace_image_paths(output_files['markdown'], self.base_url) 57 | 58 | return output_files 59 | 60 | def _generate_outputs(self, result, writers, name_stem: str) -> Dict[str, str]: 61 | """生成所有输出文件""" 62 | # 生成原始Markdown 63 | md_file = f"{name_stem}.md" 64 | result.dump_md(writers['markdown'], md_file, self.image_dir.name) 65 | 66 | # 生成中间文件 67 | # result.dump_content_list(writers['markdown'], f"{name_stem}_content.json") 68 | # result.dump_middle_json(writers['markdown'], f"{name_stem}_middle.json") 69 | 70 | return { 71 | 'markdown': str(self.output_dir / md_file), 72 | 'images': str(self.image_dir), 73 | # 'middle_json': str(self.output_dir / f"{name_stem}_middle.json") 74 | } 75 | 76 | def _replace_image_paths(self, md_path: str, base_url: str): 77 | """替换Markdown文件中的本地图像路径为HTTP URL""" 78 | with open(md_path, 'r', encoding='utf-8') as f: 79 
| content = f.read() 80 | 81 | # 匹配 Markdown 中的图像链接,假设格式为 ![alt](images/xxxxx) 82 | pattern = r'!\[.*?\]\((images/.*?)\)' 83 | replacement = lambda m: f'![{m.group(0).split("]")[0].split("[")[1]}]({urllib.parse.urljoin(base_url, "images/")}{m.group(1).split("/")[-1]})' 84 | new_content = re.sub(pattern, replacement, content) 85 | 86 | # 将修改后的内容写回文件 87 | with open(md_path, 'w', encoding='utf-8') as f: 88 | f.write(new_content) 89 | 90 | def _adjust_title_levels(self, md_path: str): 91 | """执行Markdown标题修正""" 92 | processor = MarkdownTitleProcessor() 93 | processor.process_file(md_path) 94 | 95 | 96 | if __name__ == "__main__": 97 | # 示例用法 98 | processor = PDFProcessor() 99 | result = processor.process("/path/to/your.pdf") 100 | print(f"处理完成,输出文件:{result}") 101 | -------------------------------------------------------------------------------- /core/converters/mineru/title_corrector.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import List, Tuple, Optional 3 | from pathlib import Path 4 | 5 | 6 | class MarkdownTitleProcessor: 7 | """智能Markdown标题层级处理器""" 8 | 9 | def __init__(self, title_patterns: Optional[List[Tuple[str, int]]] = None): 10 | """ 11 | 初始化标题处理器 12 | 13 | Args: 14 | title_patterns: 自定义标题模式列表,格式为[(正则模式, 基准层级), ...] 15 | """ 16 | # 默认支持中英文混合标题模式 17 | self.title_patterns = title_patterns or [ 18 | # 中文章节模式 19 | (r'^(第[一二三四五六七八九十百]+章)\s*[::]?\s*.+', 1), 20 | (r'^(第[一二三四五六七八九十百]+节)\s*[::]?\s*.+', 2), 21 | (r'^【.+】\s*.+', 2), 22 | 23 | # 英文章节模式 24 | (r'^(Chapter|CHAPTER)\s+\d+\.?\s*[:-]?\s*.+', 1), 25 | (r'^(Section|SECTION)\s+\d+\.?\d*\s*[:-]?\s*.+', 2), 26 | 27 | # 数字层级模式 28 | (r'^\d+(?![.]\d)', 1), # 单独数字开头:1 29 | (r'^\d+\.\d+(?![.]\d)', 2), # 二级编号:1.1 30 | (r'^\d+\.\d+\.\d+', 3), # 三级编号:1.1.1 31 | (r'^\d+\.\d+\.\d+\.\d+', 4), # 四级编号:1.1.1.1 32 | 33 | # 特殊标识 34 | (r'^(※|◆|►)\s*.+', 3), # 特殊符号标题 35 | (r'^(Note|Warning):\s*.+', 4) # 提示类标题 36 | ] 37 | 38 | # 编译正则表达式 39 | self.compiled_patterns = [ 40 | (re.compile(pattern, re.IGNORECASE), level) 41 | for pattern, level in self.title_patterns 42 | ] 43 | 44 | # 层级栈管理 45 | self.level_stack = [0] # [当前层级,父层级,祖父层级...] 
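        # Illustrative traces of the pattern tiers above (titles are examples):
        #   "第一章 总则"    -> chapter pattern      -> base level 1
        #   "1.2 安装步骤"   -> x.y numbering tier   -> base level 2
        #   "Note: 示例说明" -> note/warning pattern -> base level 4
        # determine_level() then maps the matched base level to one relative
        # to level_stack, so headings nest consistently.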
46 | 
47 |     def _clean_title(self, title: str) -> str:
48 |         """清洗标题内容"""
49 |         # 移除常见干扰符号
50 |         title = re.sub(r'^[【《〈((]', '', title)
51 |         title = re.sub(r'[】》〉)):.]$', '', title)
52 |         # 去除首尾特殊符号
53 |         return title.strip('※★▪•·\t ')
54 | 
55 |     def determine_level(self, title: str) -> int:
56 |         """智能判断标题层级"""
57 |         clean_title = self._clean_title(title)
58 | 
59 |         # 优先匹配预定义模式
60 |         for pattern, base_level in self.compiled_patterns:
61 |             if pattern.match(clean_title):
62 |                 return self._calculate_relative_level(base_level)
63 | 
64 |         # 无匹配时根据上下文推断
65 |         return self._infer_level_from_context(clean_title)
66 | 
67 |     def _calculate_relative_level(self, base_level: int) -> int:
68 |         """计算相对层级"""
69 |         # 当前基准层级深度
70 |         current_depth = len(self.level_stack)
71 | 
72 |         # 如果基准层级比当前深,则作为子级
73 |         if base_level > current_depth:
74 |             return current_depth + 1
75 |         # 如果基准层级较浅,则重置层级栈
76 |         if base_level < current_depth:
77 |             self.level_stack = self.level_stack[:base_level]
78 |         return base_level
79 | 
80 |     def _infer_level_from_context(self, title: str) -> int:
81 |         """根据上下文推断层级"""
82 |         # 根据标题长度和内容特征推断
83 |         if len(title) < 15 and not re.search(r'\s', title):
84 |             return min(len(self.level_stack) + 1, 6)
85 |         return max(len(self.level_stack), 1)
86 | 
87 |     def process_line(self, line: str) -> str:
88 |         """处理单行Markdown文本"""
89 |         # 匹配标题行
90 |         match = re.match(r'^(#+)\s+(.+)$', line.strip())
91 |         if not match:
92 |             return line
93 | 
94 |         original_level = len(match.group(1))
95 |         title_content = match.group(2)
96 | 
97 |         # 计算新层级
98 |         new_level = self.determine_level(title_content)
99 |         new_level = max(1, min(new_level, 6))  # 限制在1-6级
100 | 
101 |         # 更新层级栈
102 |         if new_level > len(self.level_stack):
103 |             self.level_stack.append(new_level)
104 |         else:
105 |             self.level_stack = self.level_stack[:new_level]
106 | 
107 |         return f"{'#' * new_level} {title_content}\n"
108 | 
109 |     def process_file(self, input_path: str, output_path: Optional[str] = None):
110 |         """处理整个Markdown文件"""
111 |         input_file = Path(input_path)
112 |         output_file = Path(output_path) if output_path else input_file
113 | 
114 |         with input_file.open('r', encoding='utf-8') as f:
115 |             lines = f.readlines()
116 | 
117 |         processed_lines = []
118 |         for line in lines:
119 |             processed_lines.append(self.process_line(line))
120 | 
121 |         with output_file.open('w', encoding='utf-8') as f:
122 |             f.writelines(processed_lines)
123 | 
124 | 
125 | if __name__ == '__main__':
126 |     # Minimal usage example; the path is illustrative.
127 |     MarkdownTitleProcessor().process_file('example.md')
--------------------------------------------------------------------------------
/core/converters/mp3.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | from typing import Union
4 | from warnings import catch_warnings, resetwarnings
5 | 
6 | # Optional Transcription support
7 | IS_AUDIO_TRANSCRIPTION_CAPABLE = False
8 | try:
9 |     # Using warnings' catch_warnings to catch
10 |     # pydub's warning of ffmpeg or avconv missing
11 |     with catch_warnings(record=True) as w:
12 |         import pydub
13 | 
14 |         if w:
15 |             raise ModuleNotFoundError
16 |     import speech_recognition as sr
17 | 
18 |     IS_AUDIO_TRANSCRIPTION_CAPABLE = True
19 | except ModuleNotFoundError:
20 |     pass
21 | finally:
22 |     resetwarnings()
23 | 
24 | from core.base import DocumentConverterResult
25 | from core.converters.wav import WavConverter
26 | 
27 | 
28 | class Mp3Converter(WavConverter):
29 |     """
30 |     Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
31 | """ 32 | 33 | def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: 34 | # Bail if not a MP3 35 | extension = kwargs.get("file_extension", "") 36 | if extension.lower() != ".mp3": 37 | return None 38 | 39 | md_content = "" 40 | 41 | # Add metadata 42 | metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) 43 | if metadata: 44 | for f in [ 45 | "Title", 46 | "Artist", 47 | "Author", 48 | "Band", 49 | "Album", 50 | "Genre", 51 | "Track", 52 | "DateTimeOriginal", 53 | "CreateDate", 54 | "Duration", 55 | ]: 56 | if f in metadata: 57 | md_content += f"{f}: {metadata[f]}\n" 58 | 59 | # Transcribe 60 | if IS_AUDIO_TRANSCRIPTION_CAPABLE: 61 | handle, temp_path = tempfile.mkstemp(suffix=".wav") 62 | os.close(handle) 63 | try: 64 | sound = pydub.AudioSegment.from_mp3(local_path) 65 | sound.export(temp_path, format="wav") 66 | 67 | _args = dict() 68 | _args.update(kwargs) 69 | _args["file_extension"] = ".wav" 70 | 71 | try: 72 | transcript = super()._transcribe_audio(temp_path).strip() 73 | md_content += "\n\n### Audio Transcript:\n" + ( 74 | "[No speech detected]" if transcript == "" else transcript 75 | ) 76 | except Exception: 77 | md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio." 78 | 79 | finally: 80 | os.unlink(temp_path) 81 | 82 | # Return the result 83 | return DocumentConverterResult( 84 | title=None, 85 | text_content=md_content.strip(), 86 | ) 87 | -------------------------------------------------------------------------------- /core/converters/outlook.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Any 2 | 3 | from olefile import olefile 4 | 5 | from core.base import FileConversionException, DocumentConverterResult, DocumentConverter 6 | 7 | 8 | class OutlookMsgConverter(DocumentConverter): 9 | """Converts Outlook .msg files to markdown by extracting email metadata and content. 
10 | 11 | Uses the olefile package to parse the .msg file structure and extract: 12 | - Email headers (From, To, Subject) 13 | - Email body content 14 | """ 15 | 16 | def convert( 17 | self, local_path: str, **kwargs: Any 18 | ) -> Union[None, DocumentConverterResult]: 19 | # Bail if not a MSG file 20 | extension = kwargs.get("file_extension", "") 21 | if extension.lower() != ".msg": 22 | return None 23 | 24 | try: 25 | msg = olefile.OleFileIO(local_path) 26 | # Extract email metadata 27 | md_content = "# Email Message\n\n" 28 | 29 | # Get headers 30 | headers = { 31 | "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"), 32 | "To": self._get_stream_data(msg, "__substg1.0_0E04001F"), 33 | "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"), 34 | } 35 | 36 | # Add headers to markdown 37 | for key, value in headers.items(): 38 | if value: 39 | md_content += f"**{key}:** {value}\n" 40 | 41 | md_content += "\n## Content\n\n" 42 | 43 | # Get email body 44 | body = self._get_stream_data(msg, "__substg1.0_1000001F") 45 | if body: 46 | md_content += body 47 | 48 | msg.close() 49 | 50 | return DocumentConverterResult( 51 | title=headers.get("Subject"), text_content=md_content.strip() 52 | ) 53 | 54 | except Exception as e: 55 | raise FileConversionException( 56 | f"Could not convert MSG file '{local_path}': {str(e)}" 57 | ) 58 | 59 | def _get_stream_data( 60 | self, msg: olefile.OleFileIO, stream_path: str 61 | ) -> Union[str, None]: 62 | """Helper to safely extract and decode stream data from the MSG file.""" 63 | try: 64 | if msg.exists(stream_path): 65 | data = msg.openstream(stream_path).read() 66 | # Try UTF-16 first (common for .msg files) 67 | try: 68 | return data.decode("utf-16-le").strip() 69 | except UnicodeDecodeError: 70 | # Fall back to UTF-8 71 | try: 72 | return data.decode("utf-8").strip() 73 | except UnicodeDecodeError: 74 | # Last resort - ignore errors 75 | return data.decode("utf-8", errors="ignore").strip() 76 | except Exception: 77 | pass 78 | return None -------------------------------------------------------------------------------- /core/converters/pdf.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from pathlib import Path 3 | 4 | from core.base import DocumentConverter, DocumentConverterResult, FileConversionException 5 | 6 | 7 | class PdfConverter(DocumentConverter): 8 | """默认PDF解析器(simple模式,基于pdfminer)""" 9 | 10 | def convert(self, local_path: str, **kwargs) -> Union[None, DocumentConverterResult]: 11 | # Bail if not a pdf 12 | extension = kwargs.get("file_extension", "") 13 | if extension.lower() != ".pdf": 14 | return None 15 | try: 16 | import pdfminer.high_level 17 | return DocumentConverterResult( 18 | title=None, 19 | text_content=pdfminer.high_level.extract_text(local_path) 20 | ) 21 | except Exception as e: 22 | raise FileConversionException(f"Simple PDF解析失败: {str(e)}") 23 | 24 | 25 | class AdvancedPdfConverter(DocumentConverter): 26 | """使用mineru的增强PDF解析器(advanced模式)""" 27 | 28 | def convert(self, local_path: str, **kwargs) -> DocumentConverterResult: 29 | # Bail if not a pdf 30 | extension = kwargs.get("file_extension", "") 31 | if extension.lower() != ".pdf": 32 | return None 33 | 34 | try: 35 | from core.converters.mineru.pdf_processor import PDFProcessor 36 | processor = PDFProcessor(**kwargs) 37 | result = processor.process(local_path) 38 | 39 | # 读取生成的markdown文件 40 | with open(result["markdown"], "r", encoding="utf-8") as f: 41 | md_content = f.read() 42 | 43 | return 
DocumentConverterResult( 44 | title=Path(local_path).stem, 45 | text_content=md_content 46 | ) 47 | except ImportError: 48 | raise RuntimeError("miner模块未找到,请安装mineru解析器") 49 | except Exception as e: 50 | raise FileConversionException(f"Advanced PDF解析失败: {str(e)}") 51 | 52 | 53 | class CloudPdfConverter(DocumentConverter): 54 | """云端PDF解析器(预留cloud模式实现)""" 55 | 56 | def convert(self, local_path: str, **kwargs) -> DocumentConverterResult: 57 | # Bail if not a pdf 58 | extension = kwargs.get("file_extension", "") 59 | if extension.lower() != ".pdf": 60 | return None 61 | raise NotImplementedError("Cloud模式尚未实现") 62 | 63 | -------------------------------------------------------------------------------- /core/converters/plaintext.py: -------------------------------------------------------------------------------- 1 | import mimetypes 2 | from typing import Any, Union 3 | 4 | from charset_normalizer import from_path 5 | 6 | from core.base import DocumentConverter, DocumentConverterResult 7 | 8 | 9 | class PlainTextConverter(DocumentConverter): 10 | """Anything with content type text/plain""" 11 | 12 | def convert( 13 | self, local_path: str, **kwargs: Any 14 | ) -> Union[None, DocumentConverterResult]: 15 | # Guess the content type from any file extension that might be around 16 | content_type, _ = mimetypes.guess_type( 17 | "__placeholder" + kwargs.get("file_extension", "") 18 | ) 19 | 20 | # Only accept text files 21 | if content_type is None: 22 | return None 23 | elif all( 24 | not content_type.lower().startswith(type_prefix) 25 | for type_prefix in ["text/", "application/json"] 26 | ): 27 | return None 28 | 29 | text_content = str(from_path(local_path).best()) 30 | return DocumentConverterResult( 31 | title=None, 32 | text_content=text_content, 33 | ) 34 | -------------------------------------------------------------------------------- /core/converters/pptx.py: -------------------------------------------------------------------------------- 1 | import html 2 | import re 3 | from typing import Union 4 | 5 | import pptx 6 | 7 | from core.base import DocumentConverterResult 8 | from core.converters.html import HtmlConverter 9 | 10 | 11 | class PptxConverter(HtmlConverter): 12 | """ 13 | Converts PPTX files to Markdown. Supports heading, tables and images with alt text. 
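    Slides are delimited in the output by HTML comments of the form
    <!-- Slide number: n --> (emitted in convert() below), so slide
    boundaries survive in the generated Markdown.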
14 | """ 15 | 16 | def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: 17 | # Bail if not a PPTX 18 | extension = kwargs.get("file_extension", "") 19 | if extension.lower() != ".pptx": 20 | return None 21 | 22 | md_content = "" 23 | 24 | presentation = pptx.Presentation(local_path) 25 | slide_num = 0 26 | for slide in presentation.slides: 27 | slide_num += 1 28 | 29 | md_content += f"\n\n\n" 30 | 31 | title = slide.shapes.title 32 | for shape in slide.shapes: 33 | # Pictures 34 | if self._is_picture(shape): 35 | # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069 36 | alt_text = "" 37 | try: 38 | alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") 39 | except Exception: 40 | pass 41 | 42 | # A placeholder name 43 | filename = re.sub(r"\W", "", shape.name) + ".jpg" 44 | md_content += ( 45 | "\n![" 46 | + (alt_text if alt_text else shape.name) 47 | + "](" 48 | + filename 49 | + ")\n" 50 | ) 51 | 52 | # Tables 53 | if self._is_table(shape): 54 | html_table = "" 55 | first_row = True 56 | for row in shape.table.rows: 57 | html_table += "" 58 | for cell in row.cells: 59 | if first_row: 60 | html_table += "" 61 | else: 62 | html_table += "" 63 | html_table += "" 64 | first_row = False 65 | html_table += "
" + html.escape(cell.text) + "" + html.escape(cell.text) + "
" 66 | md_content += ( 67 | "\n" + self._convert(html_table).text_content.strip() + "\n" 68 | ) 69 | 70 | # Charts 71 | if shape.has_chart: 72 | md_content += self._convert_chart_to_markdown(shape.chart) 73 | 74 | # Text areas 75 | elif shape.has_text_frame: 76 | if shape == title: 77 | md_content += "# " + shape.text.lstrip() + "\n" 78 | else: 79 | md_content += shape.text + "\n" 80 | 81 | md_content = md_content.strip() 82 | 83 | if slide.has_notes_slide: 84 | md_content += "\n\n### Notes:\n" 85 | notes_frame = slide.notes_slide.notes_text_frame 86 | if notes_frame is not None: 87 | md_content += notes_frame.text 88 | md_content = md_content.strip() 89 | 90 | return DocumentConverterResult( 91 | title=None, 92 | text_content=md_content.strip(), 93 | ) 94 | 95 | def _is_picture(self, shape): 96 | if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE: 97 | return True 98 | if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER: 99 | if hasattr(shape, "image"): 100 | return True 101 | return False 102 | 103 | def _is_table(self, shape): 104 | if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE: 105 | return True 106 | return False 107 | 108 | def _convert_chart_to_markdown(self, chart): 109 | md = "\n\n### Chart" 110 | if chart.has_title: 111 | md += f": {chart.chart_title.text_frame.text}" 112 | md += "\n\n" 113 | data = [] 114 | category_names = [c.label for c in chart.plots[0].categories] 115 | series_names = [s.name for s in chart.series] 116 | data.append(["Category"] + series_names) 117 | 118 | for idx, category in enumerate(category_names): 119 | row = [category] 120 | for series in chart.series: 121 | row.append(series.values[idx]) 122 | data.append(row) 123 | 124 | markdown_table = [] 125 | for row in data: 126 | markdown_table.append("| " + " | ".join(map(str, row)) + " |") 127 | header = markdown_table[0] 128 | separator = "|" + "|".join(["---"] * len(data[0])) + "|" 129 | return md + "\n".join([header, separator] + markdown_table[1:]) 130 | -------------------------------------------------------------------------------- /core/converters/rss.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | from typing import Union 3 | from xml.dom import minidom 4 | 5 | from bs4 import BeautifulSoup 6 | 7 | from core.base import DocumentConverter, DocumentConverterResult 8 | from core.converters.custommarkdownify import _CustomMarkdownify 9 | 10 | 11 | class RSSConverter(DocumentConverter): 12 | """Convert RSS / Atom type to markdown""" 13 | 14 | def convert( 15 | self, local_path: str, **kwargs 16 | ) -> Union[None, DocumentConverterResult]: 17 | # Bail if not RSS type 18 | extension = kwargs.get("file_extension", "") 19 | if extension.lower() not in [".xml", ".rss", ".atom"]: 20 | return None 21 | try: 22 | doc = minidom.parse(local_path) 23 | except BaseException as _: 24 | return None 25 | result = None 26 | if doc.getElementsByTagName("rss"): 27 | # A RSS feed must have a root element of 28 | result = self._parse_rss_type(doc) 29 | elif doc.getElementsByTagName("feed"): 30 | root = doc.getElementsByTagName("feed")[0] 31 | if root.getElementsByTagName("entry"): 32 | # An Atom feed must have a root element of and at least one 33 | result = self._parse_atom_type(doc) 34 | else: 35 | return None 36 | else: 37 | # not rss or atom 38 | return None 39 | 40 | return result 41 | 42 | def _parse_atom_type( 43 | self, doc: minidom.Document 44 | ) -> Union[None, DocumentConverterResult]: 45 | """Parse the type of 
an Atom feed. 46 | 47 | Returns None if the feed type is not recognized or something goes wrong. 48 | """ 49 | try: 50 | root = doc.getElementsByTagName("feed")[0] 51 | title = self._get_data_by_tag_name(root, "title") 52 | subtitle = self._get_data_by_tag_name(root, "subtitle") 53 | entries = root.getElementsByTagName("entry") 54 | md_text = f"# {title}\n" 55 | if subtitle: 56 | md_text += f"{subtitle}\n" 57 | for entry in entries: 58 | entry_title = self._get_data_by_tag_name(entry, "title") 59 | entry_summary = self._get_data_by_tag_name(entry, "summary") 60 | entry_updated = self._get_data_by_tag_name(entry, "updated") 61 | entry_content = self._get_data_by_tag_name(entry, "content") 62 | 63 | if entry_title: 64 | md_text += f"\n## {entry_title}\n" 65 | if entry_updated: 66 | md_text += f"Updated on: {entry_updated}\n" 67 | if entry_summary: 68 | md_text += self._parse_content(entry_summary) 69 | if entry_content: 70 | md_text += self._parse_content(entry_content) 71 | 72 | return DocumentConverterResult( 73 | title=title, 74 | text_content=md_text, 75 | ) 76 | except BaseException as _: 77 | return None 78 | 79 | def _parse_rss_type( 80 | self, doc: minidom.Document 81 | ) -> Union[None, DocumentConverterResult]: 82 | """Parse the type of an RSS feed. 83 | 84 | Returns None if the feed type is not recognized or something goes wrong. 85 | """ 86 | try: 87 | root = doc.getElementsByTagName("rss")[0] 88 | channel = root.getElementsByTagName("channel") 89 | if not channel: 90 | return None 91 | channel = channel[0] 92 | channel_title = self._get_data_by_tag_name(channel, "title") 93 | channel_description = self._get_data_by_tag_name(channel, "description") 94 | items = channel.getElementsByTagName("item") 95 | if channel_title: 96 | md_text = f"# {channel_title}\n" 97 | if channel_description: 98 | md_text += f"{channel_description}\n" 99 | if not items: 100 | items = [] 101 | for item in items: 102 | title = self._get_data_by_tag_name(item, "title") 103 | description = self._get_data_by_tag_name(item, "description") 104 | pubDate = self._get_data_by_tag_name(item, "pubDate") 105 | content = self._get_data_by_tag_name(item, "content:encoded") 106 | 107 | if title: 108 | md_text += f"\n## {title}\n" 109 | if pubDate: 110 | md_text += f"Published on: {pubDate}\n" 111 | if description: 112 | md_text += self._parse_content(description) 113 | if content: 114 | md_text += self._parse_content(content) 115 | 116 | return DocumentConverterResult( 117 | title=channel_title, 118 | text_content=md_text, 119 | ) 120 | except BaseException as _: 121 | print(traceback.format_exc()) 122 | return None 123 | 124 | def _parse_content(self, content: str) -> str: 125 | """Parse the content of an RSS feed item""" 126 | try: 127 | # using bs4 because many RSS feeds have HTML-styled content 128 | soup = BeautifulSoup(content, "html.parser") 129 | return _CustomMarkdownify().convert_soup(soup) 130 | except BaseException as _: 131 | return content 132 | 133 | def _get_data_by_tag_name( 134 | self, element: minidom.Element, tag_name: str 135 | ) -> Union[str, None]: 136 | """Get data from first child element with the given tag name. 137 | Returns None when no such element is found. 
138 | """ 139 | nodes = element.getElementsByTagName(tag_name) 140 | if not nodes: 141 | return None 142 | fc = nodes[0].firstChild 143 | if fc: 144 | return fc.data 145 | return None 146 | -------------------------------------------------------------------------------- /core/converters/wav.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from warnings import catch_warnings, resetwarnings 3 | 4 | # Optional Transcription support 5 | IS_AUDIO_TRANSCRIPTION_CAPABLE = False 6 | try: 7 | # Using warnings' catch_warnings to catch 8 | # pydub's warning of ffmpeg or avconv missing 9 | with catch_warnings(record=True) as w: 10 | import pydub 11 | 12 | if w: 13 | raise ModuleNotFoundError 14 | import speech_recognition as sr 15 | 16 | IS_AUDIO_TRANSCRIPTION_CAPABLE = True 17 | except ModuleNotFoundError: 18 | pass 19 | finally: 20 | resetwarnings() 21 | 22 | from core.base import DocumentConverterResult 23 | from core.converters.media import MediaConverter 24 | 25 | 26 | class WavConverter(MediaConverter): 27 | """ 28 | Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed). 29 | """ 30 | 31 | def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: 32 | # Bail if not a WAV 33 | extension = kwargs.get("file_extension", "") 34 | if extension.lower() != ".wav": 35 | return None 36 | 37 | md_content = "" 38 | 39 | # Add metadata 40 | metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) 41 | if metadata: 42 | for f in [ 43 | "Title", 44 | "Artist", 45 | "Author", 46 | "Band", 47 | "Album", 48 | "Genre", 49 | "Track", 50 | "DateTimeOriginal", 51 | "CreateDate", 52 | "Duration", 53 | ]: 54 | if f in metadata: 55 | md_content += f"{f}: {metadata[f]}\n" 56 | 57 | # Transcribe 58 | if IS_AUDIO_TRANSCRIPTION_CAPABLE: 59 | try: 60 | transcript = self._transcribe_audio(local_path) 61 | md_content += "\n\n### Audio Transcript:\n" + ( 62 | "[No speech detected]" if transcript == "" else transcript 63 | ) 64 | except Exception: 65 | md_content += ( 66 | "\n\n### Audio Transcript:\nError. Could not transcribe this audio." 
67 | ) 68 | 69 | return DocumentConverterResult( 70 | title=None, 71 | text_content=md_content.strip(), 72 | ) 73 | 74 | def _transcribe_audio(self, local_path) -> str: 75 | recognizer = sr.Recognizer() 76 | with sr.AudioFile(local_path) as source: 77 | audio = recognizer.record(source) 78 | return recognizer.recognize_google(audio).strip() 79 | -------------------------------------------------------------------------------- /core/converters/wikipedia.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Any, Union 3 | 4 | from bs4 import BeautifulSoup 5 | 6 | from core.base import DocumentConverter, DocumentConverterResult 7 | from core.converters.custommarkdownify import _CustomMarkdownify 8 | 9 | 10 | class WikipediaConverter(DocumentConverter): 11 | """Handle Wikipedia pages separately, focusing only on the main document content.""" 12 | 13 | def convert( 14 | self, local_path: str, **kwargs: Any 15 | ) -> Union[None, DocumentConverterResult]: 16 | # Bail if not Wikipedia 17 | extension = kwargs.get("file_extension", "") 18 | if extension.lower() not in [".html", ".htm"]: 19 | return None 20 | url = kwargs.get("url", "") 21 | if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url): 22 | return None 23 | 24 | # Parse the file 25 | soup = None 26 | with open(local_path, "rt", encoding="utf-8") as fh: 27 | soup = BeautifulSoup(fh.read(), "html.parser") 28 | 29 | # Remove javascript and style blocks 30 | for script in soup(["script", "style"]): 31 | script.extract() 32 | 33 | # Print only the main content 34 | body_elm = soup.find("div", {"id": "mw-content-text"}) 35 | title_elm = soup.find("span", {"class": "mw-page-title-main"}) 36 | 37 | webpage_text = "" 38 | main_title = None if soup.title is None else soup.title.string 39 | 40 | if body_elm: 41 | # What's the title 42 | if title_elm and len(title_elm) > 0: 43 | main_title = title_elm.string # type: ignore 44 | assert isinstance(main_title, str) 45 | 46 | # Convert the page 47 | webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup( 48 | body_elm 49 | ) 50 | else: 51 | webpage_text = _CustomMarkdownify().convert_soup(soup) 52 | 53 | return DocumentConverterResult( 54 | title=main_title, 55 | text_content=webpage_text, 56 | ) 57 | -------------------------------------------------------------------------------- /core/converters/xls.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import pandas as pd 4 | 5 | from core.base import DocumentConverterResult 6 | from core.converters.html import HtmlConverter 7 | 8 | 9 | class XlsConverter(HtmlConverter): 10 | """ 11 | Converts XLS files to Markdown, with each sheet presented as a separate Markdown table. 
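
    Illustrative output shape for a workbook with one sheet:

        ## Sheet1
        | Column1 | Column2 |
        | --- | --- |
        | a | b |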
12 | """ 13 | 14 | def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: 15 | # Bail if not a XLS 16 | extension = kwargs.get("file_extension", "") 17 | if extension.lower() != ".xls": 18 | return None 19 | 20 | sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd") 21 | md_content = "" 22 | for s in sheets: 23 | md_content += f"## {s}\n" 24 | html_content = sheets[s].to_html(index=False) 25 | md_content += self._convert(html_content).text_content.strip() + "\n\n" 26 | 27 | return DocumentConverterResult( 28 | title=None, 29 | text_content=md_content.strip(), 30 | ) 31 | -------------------------------------------------------------------------------- /core/converters/xlsx.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import pandas as pd 4 | 5 | from core.base import DocumentConverterResult 6 | from core.converters.html import HtmlConverter 7 | 8 | 9 | class XlsxConverter(HtmlConverter): 10 | """ 11 | Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. 12 | """ 13 | 14 | def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: 15 | # Bail if not a XLSX 16 | extension = kwargs.get("file_extension", "") 17 | if extension.lower() != ".xlsx": 18 | return None 19 | 20 | sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl") 21 | md_content = "" 22 | for s in sheets: 23 | md_content += f"## {s}\n" 24 | html_content = sheets[s].to_html(index=False) 25 | md_content += self._convert(html_content).text_content.strip() + "\n\n" 26 | 27 | return DocumentConverterResult( 28 | title=None, 29 | text_content=md_content.strip(), 30 | ) 31 | -------------------------------------------------------------------------------- /core/converters/youtube.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from typing import Any, Union, Dict, List 4 | from urllib.parse import urlparse, parse_qs 5 | 6 | from bs4 import BeautifulSoup 7 | 8 | # Optional YouTube transcription support 9 | try: 10 | from youtube_transcript_api import YouTubeTranscriptApi 11 | 12 | IS_YOUTUBE_TRANSCRIPT_CAPABLE = True 13 | except ModuleNotFoundError: 14 | pass 15 | 16 | 17 | from core.base import DocumentConverter, DocumentConverterResult 18 | 19 | 20 | class YouTubeConverter(DocumentConverter): 21 | """Handle YouTube specially, focusing on the video title, description, and transcript.""" 22 | 23 | def convert( 24 | self, local_path: str, **kwargs: Any 25 | ) -> Union[None, DocumentConverterResult]: 26 | # Bail if not YouTube 27 | extension = kwargs.get("file_extension", "") 28 | if extension.lower() not in [".html", ".htm"]: 29 | return None 30 | url = kwargs.get("url", "") 31 | if not url.startswith("https://www.youtube.com/watch?"): 32 | return None 33 | 34 | # Parse the file 35 | soup = None 36 | with open(local_path, "rt", encoding="utf-8") as fh: 37 | soup = BeautifulSoup(fh.read(), "html.parser") 38 | 39 | # Read the meta tags 40 | assert soup.title is not None and soup.title.string is not None 41 | metadata: Dict[str, str] = {"title": soup.title.string} 42 | for meta in soup(["meta"]): 43 | for a in meta.attrs: 44 | if a in ["itemprop", "property", "name"]: 45 | metadata[meta[a]] = meta.get("content", "") 46 | break 47 | 48 | # We can also try to read the full description. 
This is more prone to breaking, since it reaches into the page implementation 49 | try: 50 | for script in soup(["script"]): 51 | content = script.text 52 | if "ytInitialData" in content: 53 | lines = re.split(r"\r?\n", content) 54 | obj_start = lines[0].find("{") 55 | obj_end = lines[0].rfind("}") 56 | if obj_start >= 0 and obj_end >= 0: 57 | data = json.loads(lines[0][obj_start : obj_end + 1]) 58 | attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore 59 | if attrdesc: 60 | metadata["description"] = str(attrdesc["content"]) 61 | break 62 | except Exception: 63 | pass 64 | 65 | # Start preparing the page 66 | webpage_text = "# YouTube\n" 67 | 68 | title = self._get(metadata, ["title", "og:title", "name"]) # type: ignore 69 | assert isinstance(title, str) 70 | 71 | if title: 72 | webpage_text += f"\n## {title}\n" 73 | 74 | stats = "" 75 | views = self._get(metadata, ["interactionCount"]) # type: ignore 76 | if views: 77 | stats += f"- **Views:** {views}\n" 78 | 79 | keywords = self._get(metadata, ["keywords"]) # type: ignore 80 | if keywords: 81 | stats += f"- **Keywords:** {keywords}\n" 82 | 83 | runtime = self._get(metadata, ["duration"]) # type: ignore 84 | if runtime: 85 | stats += f"- **Runtime:** {runtime}\n" 86 | 87 | if len(stats) > 0: 88 | webpage_text += f"\n### Video Metadata\n{stats}\n" 89 | 90 | description = self._get(metadata, ["description", "og:description"]) # type: ignore 91 | if description: 92 | webpage_text += f"\n### Description\n{description}\n" 93 | 94 | if IS_YOUTUBE_TRANSCRIPT_CAPABLE: 95 | transcript_text = "" 96 | parsed_url = urlparse(url) # type: ignore 97 | params = parse_qs(parsed_url.query) # type: ignore 98 | if "v" in params: 99 | assert isinstance(params["v"][0], str) 100 | video_id = str(params["v"][0]) 101 | try: 102 | youtube_transcript_languages = kwargs.get( 103 | "youtube_transcript_languages", ("en",) 104 | ) 105 | # Must be a single transcript. 
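                    # Languages are tried in the order given, so ("en",)
                    # prefers English. If no listed language has a transcript,
                    # get_transcript() raises and the except below leaves
                    # transcript_text empty.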
106 | transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages) # type: ignore 107 | transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore 108 | # Alternative formatting: 109 | # formatter = TextFormatter() 110 | # formatter.format_transcript(transcript) 111 | except Exception: 112 | pass 113 | if transcript_text: 114 | webpage_text += f"\n### Transcript\n{transcript_text}\n" 115 | 116 | title = title if title else soup.title.string 117 | assert isinstance(title, str) 118 | 119 | return DocumentConverterResult( 120 | title=title, 121 | text_content=webpage_text, 122 | ) 123 | 124 | def _get( 125 | self, 126 | metadata: Dict[str, str], 127 | keys: List[str], 128 | default: Union[str, None] = None, 129 | ) -> Union[str, None]: 130 | for k in keys: 131 | if k in metadata: 132 | return metadata[k] 133 | return default 134 | 135 | def _findKey(self, json: Any, key: str) -> Union[str, None]: # TODO: Fix json type 136 | if isinstance(json, list): 137 | for elm in json: 138 | ret = self._findKey(elm, key) 139 | if ret is not None: 140 | return ret 141 | elif isinstance(json, dict): 142 | for k in json: 143 | if k == key: 144 | return json[k] 145 | else: 146 | ret = self._findKey(json[k], key) 147 | if ret is not None: 148 | return ret 149 | return None 150 | -------------------------------------------------------------------------------- /core/converters/zip.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import zipfile 4 | from typing import Any, Union 5 | 6 | from core.base import DocumentConverterResult, DocumentConverter 7 | 8 | 9 | class ZipConverter(DocumentConverter): 10 | """Converts ZIP files to markdown by extracting and converting all contained files. 11 | 12 | The converter extracts the ZIP contents to a temporary directory, processes each file 13 | using appropriate converters based on file extensions, and then combines the results 14 | into a single markdown document. The temporary directory is cleaned up after processing. 
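    Extraction is guarded against path traversal ("zip-slip"): each member
    path is normalized and must remain inside the extraction directory,
    otherwise a ValueError is raised (see convert() below).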
15 | 16 | Example output format: 17 | ```markdown 18 | Content from the zip file `example.zip`: 19 | 20 | ## File: docs/readme.txt 21 | 22 | This is the content of readme.txt 23 | Multiple lines are preserved 24 | 25 | ## File: images/example.jpg 26 | 27 | ImageSize: 1920x1080 28 | DateTimeOriginal: 2024-02-15 14:30:00 29 | Description: A beautiful landscape photo 30 | 31 | ## File: data/report.xlsx 32 | 33 | ## Sheet1 34 | | Column1 | Column2 | Column3 | 35 | |---------|---------|---------| 36 | | data1 | data2 | data3 | 37 | | data4 | data5 | data6 | 38 | ``` 39 | 40 | Key features: 41 | - Maintains original file structure in headings 42 | - Processes nested files recursively 43 | - Uses appropriate converters for each file type 44 | - Preserves formatting of converted content 45 | - Cleans up temporary files after processing 46 | """ 47 | 48 | def convert( 49 | self, local_path: str, **kwargs: Any 50 | ) -> Union[None, DocumentConverterResult]: 51 | # Bail if not a ZIP 52 | extension = kwargs.get("file_extension", "") 53 | if extension.lower() != ".zip": 54 | return None 55 | 56 | # Get parent converters list if available 57 | parent_converters = kwargs.get("_parent_converters", []) 58 | if not parent_converters: 59 | return DocumentConverterResult( 60 | title=None, 61 | text_content=f"[ERROR] No converters available to process zip contents from: {local_path}", 62 | ) 63 | 64 | extracted_zip_folder_name = ( 65 | f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}" 66 | ) 67 | extraction_dir = os.path.normpath( 68 | os.path.join(os.path.dirname(local_path), extracted_zip_folder_name) 69 | ) 70 | md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n" 71 | 72 | try: 73 | # Extract the zip file safely 74 | with zipfile.ZipFile(local_path, "r") as zipObj: 75 | # Safeguard against path traversal 76 | for member in zipObj.namelist(): 77 | member_path = os.path.normpath(os.path.join(extraction_dir, member)) 78 | if ( 79 | not os.path.commonprefix([extraction_dir, member_path]) 80 | == extraction_dir 81 | ): 82 | raise ValueError( 83 | f"Path traversal detected in zip file: {member}" 84 | ) 85 | 86 | # Extract all files safely 87 | zipObj.extractall(path=extraction_dir) 88 | 89 | # Process each extracted file 90 | for root, dirs, files in os.walk(extraction_dir): 91 | for name in files: 92 | file_path = os.path.join(root, name) 93 | relative_path = os.path.relpath(file_path, extraction_dir) 94 | 95 | # Get file extension 96 | _, file_extension = os.path.splitext(name) 97 | 98 | # Update kwargs for the file 99 | file_kwargs = kwargs.copy() 100 | file_kwargs["file_extension"] = file_extension 101 | file_kwargs["_parent_converters"] = parent_converters 102 | 103 | # Try converting the file using available converters 104 | for converter in parent_converters: 105 | # Skip the zip converter to avoid infinite recursion 106 | if isinstance(converter, ZipConverter): 107 | continue 108 | 109 | result = converter.convert(file_path, **file_kwargs) 110 | if result is not None: 111 | md_content += f"\n## File: {relative_path}\n\n" 112 | md_content += result.text_content + "\n\n" 113 | break 114 | 115 | # Clean up extracted files if specified 116 | if kwargs.get("cleanup_extracted", True): 117 | shutil.rmtree(extraction_dir) 118 | 119 | return DocumentConverterResult(title=None, text_content=md_content.strip()) 120 | 121 | except zipfile.BadZipFile: 122 | return DocumentConverterResult( 123 | title=None, 124 | text_content=f"[ERROR] Invalid or corrupted zip file: 
{local_path}", 125 | ) 126 | except ValueError as ve: 127 | return DocumentConverterResult( 128 | title=None, 129 | text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}", 130 | ) 131 | except Exception as e: 132 | return DocumentConverterResult( 133 | title=None, 134 | text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}", 135 | ) -------------------------------------------------------------------------------- /core/markitdown.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import copy 3 | import mimetypes 4 | import os 5 | import re 6 | import tempfile 7 | import traceback 8 | from pathlib import Path 9 | from typing import Any, List, Optional, Union 10 | from urllib.parse import urlparse 11 | 12 | # File-format detection 13 | import puremagic 14 | import requests 15 | 16 | from core.base import DocumentConverterResult, DocumentConverter, FileConversionException, UnsupportedFormatException 17 | from core.converters.bingsearch import BingSerpConverter 18 | from core.converters.docx import DocxConverter 19 | from core.converters.html import HtmlConverter 20 | from core.converters.image import ImageConverter 21 | from core.converters.ipynb import IpynbConverter 22 | from core.converters.mp3 import Mp3Converter 23 | from core.converters.outlook import OutlookMsgConverter 24 | from core.converters.plaintext import PlainTextConverter 25 | from core.converters.pptx import PptxConverter 26 | from core.converters.rss import RSSConverter 27 | from core.converters.wav import WavConverter 28 | from core.converters.wikipedia import WikipediaConverter 29 | from core.converters.xls import XlsConverter 30 | from core.converters.xlsx import XlsxConverter 31 | from core.converters.youtube import YouTubeConverter 32 | from core.converters.zip import ZipConverter 33 | 34 | 35 | class MarkItDown: 36 | """(In preview) An extremely simple text-based document reader, suitable for LLM use. 
37 |     This reader will convert common file-types or webpages to Markdown."""
38 | 
39 |     def __init__(
40 |         self,
41 |         requests_session: Optional[requests.Session] = None,
42 |         llm_client: Optional[Any] = None,
43 |         llm_model: Optional[str] = None,
44 |         style_map: Optional[str] = None,
45 |         exiftool_path: Optional[str] = None,
46 |         mode: str = "simple",  # simple|advanced|cloud
47 |     ):
48 |         self.mode = mode
49 |         if requests_session is None:
50 |             self._requests_session = requests.Session()
51 |         else:
52 |             self._requests_session = requests_session
53 | 
54 |         if exiftool_path is None:
55 |             exiftool_path = os.environ.get("EXIFTOOL_PATH")
56 | 
57 |         self._llm_client = llm_client
58 |         self._llm_model = llm_model
59 |         self._style_map = style_map
60 |         self._exiftool_path = exiftool_path
61 | 
62 |         self._page_converters: List[DocumentConverter] = []
63 | 
64 |         # Register converters for successful browsing operations
65 |         # Later registrations are tried first / take higher priority than earlier registrations
66 |         # To this end, the most specific converters should appear below the most generic converters
67 |         self.register_page_converter(PlainTextConverter())
68 |         self.register_page_converter(HtmlConverter())
69 |         self.register_page_converter(RSSConverter())
70 |         self.register_page_converter(WikipediaConverter())
71 |         self.register_page_converter(YouTubeConverter())
72 |         self.register_page_converter(BingSerpConverter())
73 |         self.register_page_converter(DocxConverter())
74 |         self.register_page_converter(XlsxConverter())
75 |         self.register_page_converter(XlsConverter())
76 |         self.register_page_converter(PptxConverter())
77 |         self.register_page_converter(WavConverter())
78 |         self.register_page_converter(Mp3Converter())
79 |         self.register_page_converter(ImageConverter())
80 |         self.register_page_converter(IpynbConverter())
81 | 
82 |         # Dynamically register the PDF converter,
83 |         # ensuring each PDF converter only handles PDF files
84 |         if self.mode == 'advanced':
85 |             from core.converters.pdf import AdvancedPdfConverter
86 |             self.register_page_converter(AdvancedPdfConverter())
87 |         elif self.mode == 'cloud':
88 |             from core.converters.pdf import CloudPdfConverter
89 |             self.register_page_converter(CloudPdfConverter())
90 |         else:  # default: simple mode
91 |             from core.converters.pdf import PdfConverter
92 |             self.register_page_converter(PdfConverter())
93 | 
94 |         self.register_page_converter(ZipConverter())
95 |         self.register_page_converter(OutlookMsgConverter())
96 | 
97 |     def convert(
98 |         self, source: Union[str, requests.Response, Path], **kwargs: Any
99 |     ) -> DocumentConverterResult:  # TODO: deal with kwargs
100 |         """
101 |         Args:
102 |             - source: a string (file path or URL), a pathlib.Path object, or a requests.Response object
103 |             - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
104 | """ 105 | 106 | # Local path or url 107 | if isinstance(source, str): 108 | if ( 109 | source.startswith("http://") 110 | or source.startswith("https://") 111 | or source.startswith("file://") 112 | ): 113 | return self.convert_url(source, **kwargs) 114 | else: 115 | return self.convert_local(source, **kwargs) 116 | # Request response 117 | elif isinstance(source, requests.Response): 118 | return self.convert_response(source, **kwargs) 119 | elif isinstance(source, Path): 120 | return self.convert_local(source, **kwargs) 121 | 122 | def convert_local( 123 | self, path: Union[str, Path], **kwargs: Any 124 | ) -> DocumentConverterResult: # TODO: deal with kwargs 125 | if isinstance(path, Path): 126 | path = str(path) 127 | # Prepare a list of extensions to try (in order of priority) 128 | ext = kwargs.get("file_extension") 129 | extensions = [ext] if ext is not None else [] 130 | 131 | # Get extension alternatives from the path and puremagic 132 | base, ext = os.path.splitext(path) 133 | self._append_ext(extensions, ext) 134 | 135 | for g in self._guess_ext_magic(path): 136 | self._append_ext(extensions, g) 137 | 138 | # Convert 139 | return self._convert(path, extensions, **kwargs) 140 | 141 | # TODO what should stream's type be? 142 | def convert_stream( 143 | self, stream: Any, **kwargs: Any 144 | ) -> DocumentConverterResult: # TODO: deal with kwargs 145 | # Prepare a list of extensions to try (in order of priority) 146 | ext = kwargs.get("file_extension") 147 | extensions = [ext] if ext is not None else [] 148 | 149 | # Save the file locally to a temporary file. It will be deleted before this method exits 150 | handle, temp_path = tempfile.mkstemp() 151 | fh = os.fdopen(handle, "wb") 152 | result = None 153 | try: 154 | # Write to the temporary file 155 | content = stream.read() 156 | if isinstance(content, str): 157 | fh.write(content.encode("utf-8")) 158 | else: 159 | fh.write(content) 160 | fh.close() 161 | 162 | # Use puremagic to check for more extension options 163 | for g in self._guess_ext_magic(temp_path): 164 | self._append_ext(extensions, g) 165 | 166 | # Convert 167 | result = self._convert(temp_path, extensions, **kwargs) 168 | # Clean up 169 | finally: 170 | try: 171 | fh.close() 172 | except Exception: 173 | pass 174 | os.unlink(temp_path) 175 | 176 | return result 177 | 178 | def convert_url( 179 | self, url: str, **kwargs: Any 180 | ) -> DocumentConverterResult: # TODO: fix kwargs type 181 | # Send a HTTP request to the URL 182 | response = self._requests_session.get(url, stream=True) 183 | response.raise_for_status() 184 | return self.convert_response(response, **kwargs) 185 | 186 | def convert_response( 187 | self, response: requests.Response, **kwargs: Any 188 | ) -> DocumentConverterResult: # TODO fix kwargs type 189 | # Prepare a list of extensions to try (in order of priority) 190 | ext = kwargs.get("file_extension") 191 | extensions = [ext] if ext is not None else [] 192 | 193 | # Guess from the mimetype 194 | content_type = response.headers.get("content-type", "").split(";")[0] 195 | self._append_ext(extensions, mimetypes.guess_extension(content_type)) 196 | 197 | # Read the content disposition if there is one 198 | content_disposition = response.headers.get("content-disposition", "") 199 | m = re.search(r"filename=([^;]+)", content_disposition) 200 | if m: 201 | base, ext = os.path.splitext(m.group(1).strip("\"'")) 202 | self._append_ext(extensions, ext) 203 | 204 | # Read from the extension from the path 205 | base, ext = 
206 |         self._append_ext(extensions, ext)
207 | 
208 |         # Save the file locally to a temporary file. It will be deleted before this method exits
209 |         handle, temp_path = tempfile.mkstemp()
210 |         fh = os.fdopen(handle, "wb")
211 |         result = None
212 |         try:
213 |             # Download the file
214 |             for chunk in response.iter_content(chunk_size=512):
215 |                 fh.write(chunk)
216 |             fh.close()
217 | 
218 |             # Use puremagic to check for more extension options
219 |             for g in self._guess_ext_magic(temp_path):
220 |                 self._append_ext(extensions, g)
221 | 
222 |             # Convert
223 |             result = self._convert(temp_path, extensions, url=response.url, **kwargs)
224 |         # Clean up
225 |         finally:
226 |             try:
227 |                 fh.close()
228 |             except Exception:
229 |                 pass
230 |             os.unlink(temp_path)
231 | 
232 |         return result
233 | 
234 |     def _convert(
235 |         self, local_path: str, extensions: List[Union[str, None]], **kwargs
236 |     ) -> DocumentConverterResult:
237 |         error_trace = ""
238 |         for ext in extensions + [None]:  # Try last with no extension
239 |             for converter in self._page_converters:
240 |                 _kwargs = copy.deepcopy(kwargs)
241 | 
242 |                 # Overwrite file_extension appropriately
243 |                 if ext is None:
244 |                     if "file_extension" in _kwargs:
245 |                         del _kwargs["file_extension"]
246 |                 else:
247 |                     _kwargs.update({"file_extension": ext})
248 | 
249 |                 # Copy any additional global options
250 |                 if "llm_client" not in _kwargs and self._llm_client is not None:
251 |                     _kwargs["llm_client"] = self._llm_client
252 | 
253 |                 if "llm_model" not in _kwargs and self._llm_model is not None:
254 |                     _kwargs["llm_model"] = self._llm_model
255 | 
256 |                 if "style_map" not in _kwargs and self._style_map is not None:
257 |                     _kwargs["style_map"] = self._style_map
258 | 
259 |                 if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
260 |                     _kwargs["exiftool_path"] = self._exiftool_path
261 | 
262 |                 # Add the list of converters for nested processing
263 |                 _kwargs["_parent_converters"] = self._page_converters
264 | 
265 |                 # If we hit an error, log it and keep trying other converters
266 |                 res = None  # reset so a raising converter cannot leave res unbound or stale
267 |                 try:
268 |                     res = converter.convert(local_path, **_kwargs)
269 |                 except Exception:
270 |                     error_trace = ("\n\n" + traceback.format_exc()).strip()
271 | 
272 |                 if res is not None:
273 |                     # Normalize the content
274 |                     res.text_content = "\n".join(
275 |                         [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
276 |                     )
277 |                     res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
278 | 
279 |                     return res
280 | 
281 |         # If we got this far without success, report any exceptions
282 |         if len(error_trace) > 0:
283 |             raise FileConversionException(
284 |                 f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
285 |             )
286 | 
287 |         # Nothing can handle it!
288 |         raise UnsupportedFormatException(
289 |             f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
290 |         )
291 | 
292 |     def _append_ext(self, extensions, ext):
293 |         """Append a non-None, non-empty extension to a list of extensions (duplicates are currently allowed)."""
294 |         if ext is None:
295 |             return
296 |         ext = ext.strip()
297 |         if ext == "":
298 |             return
299 |         # if ext not in extensions:
300 |         extensions.append(ext)
301 | 
302 |     def _guess_ext_magic(self, path):
303 |         """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
304 |         # Use puremagic to guess
305 |         try:
306 |             guesses = puremagic.magic_file(path)
307 | 
308 |             # Fix for: https://github.com/microsoft/markitdown/issues/222
309 |             # If there are no guesses, then try again after trimming leading ASCII whitespaces.
310 |             # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
311 |             # (space, tab, newline, carriage return, vertical tab, form feed).
312 |             if len(guesses) == 0:
313 |                 with open(path, "rb") as file:
314 |                     while True:
315 |                         char = file.read(1)
316 |                         if not char:  # End of file
317 |                             break
318 |                         if not char.isspace():
319 |                             file.seek(file.tell() - 1)
320 |                             break
321 |                     try:
322 |                         guesses = puremagic.magic_stream(file)
323 |                     except puremagic.main.PureError:
324 |                         pass
325 | 
326 |             extensions = list()
327 |             for g in guesses:
328 |                 ext = g.extension.strip()
329 |                 if len(ext) > 0:
330 |                     if not ext.startswith("."):
331 |                         ext = "." + ext
332 |                     if ext not in extensions:
333 |                         extensions.append(ext)
334 |             return extensions
335 |         except FileNotFoundError:
336 |             pass
337 |         except IsADirectoryError:
338 |             pass
339 |         except PermissionError:
340 |             pass
341 |         return []
342 | 
343 |     def register_page_converter(self, converter: DocumentConverter) -> None:
344 |         """Register a page text converter."""
345 |         self._page_converters.insert(0, converter)
346 | 
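A short usage sketch for `MarkItDown`; illustrative only: `FooConverter` and `notes.foo` are hypothetical, and the converter signature is assumed from how `core.base.DocumentConverter` is used throughout this file:

```python
from core.base import DocumentConverter, DocumentConverterResult
from core.markitdown import MarkItDown

class FooConverter(DocumentConverter):
    """Hypothetical converter for `.foo` files."""
    def convert(self, local_path, **kwargs):
        if kwargs.get("file_extension", "").lower() != ".foo":
            return None  # decline, so the next converter gets a chance
        with open(local_path, encoding="utf-8") as f:
            return DocumentConverterResult(title=None, text_content=f.read())

md = MarkItDown(mode="simple")
md.register_page_converter(FooConverter())  # registered last => tried first
print(md.convert("notes.foo").text_content)  # hypothetical input file
```

Because `register_page_converter` inserts at the front of the list, converters registered after construction take priority over the built-ins, which is what the registration-order comment in `__init__` relies on.
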
--------------------------------------------------------------------------------
/core/model_manager.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from pathlib import Path
4 | 
5 | from huggingface_hub import snapshot_download as hf_download
6 | from modelscope.hub.snapshot_download import snapshot_download as ms_download
7 | 
8 | DEFAULT_CONFIG_NAME = "magic-pdf.json"
9 | GITHUB_TEMPLATE_URL = "https://raw.githubusercontent.com/opendatalab/MinerU/master/magic-pdf.template.json"
10 | MODEL_REPOS = {
11 |     'main': 'opendatalab/PDF-Extract-Kit-1.0',
12 |     'layout': 'hantian/layoutreader'
13 | }
14 | 
15 | 
16 | class ModelConfigurator:
17 |     """Model configuration manager"""
18 | 
19 |     def __init__(self, device='cpu', models_dir=None, use_modelscope=True):
20 |         self.device = device
21 |         self.use_modelscope = use_modelscope
22 |         self.models_dir = models_dir
23 |         self.config_path = self._get_config_path()
24 |         self.mineru_patterns = [
25 |             "models/Layout/LayoutLMv3/*",
26 |             "models/Layout/YOLO/*",
27 |             "models/MFD/YOLO/*",
28 |             "models/MFR/unimernet_small_2501/*",
29 |             "models/TabRec/TableMaster/*",
30 |             "models/TabRec/StructEqTable/*",
31 |         ]
32 |         if self.use_modelscope:
33 |             MODEL_REPOS['layout'] = 'ppaanngggg/layoutreader'
34 | 
35 |     def _get_cache_dir(self, model_type):
36 |         """Return a cache directory that follows each hub library's conventions"""
37 |         if self.models_dir:
38 |             custom_dir = Path(self.models_dir).expanduser().resolve()
39 |             return custom_dir / model_type
40 | 
41 |         # Fall back to each hub's default cache location
42 |         if self.use_modelscope:
43 |             return Path.home() / ".cache/modelscope/hub" / MODEL_REPOS[model_type]
44 |         else:
45 |             return Path.home() / ".cache/huggingface/hub" / MODEL_REPOS[model_type]
46 | 
47 |     def _get_config_path(self):
48 |         """Return the path of the configuration file"""
49 |         env_path = os.getenv('MINERU_TOOLS_CONFIG_JSON')
50 |         return Path(env_path) if env_path else Path.home() / DEFAULT_CONFIG_NAME
51 | 
52 |     def setup_environment(self):
53 |         """Configure the environment and download the models"""
54 |         self._download_models()
55 |         self._generate_config()
56 |         os.environ['MINERU_TOOLS_CONFIG_JSON'] = str(self.config_path)
57 | 
58 |     def _download_models(self):
59 |         """Download the model snapshots from ModelScope or Hugging Face"""
60 |         downloader = ms_download if self.use_modelscope else hf_download
61 | 
62 |         model_paths = {}
63 |         for model_type in ['main', 'layout']:
64 |             cache_dir = self._get_cache_dir(model_type)
65 | 
66 |             print(f"Downloading {model_type} models to: {cache_dir}")
67 | 
68 |             # Keep the library's default cache behaviour; only override it when --models-dir is given
69 |             download_args = {
70 |                 'repo_id': MODEL_REPOS[model_type],
71 |                 'local_dir': str(cache_dir),  # ensure files are stored at the requested location
72 |                 'allow_patterns': self.mineru_patterns if model_type == 'main' else None  # filter rules
73 |             }
74 | 
75 |             # Only override the cache directory when a custom path was given
76 |             if self.models_dir:
77 |                 download_args['cache_dir'] = str(cache_dir.parent)
78 | 
79 |             snapshot_path = downloader(**download_args)
80 |             model_paths[model_type] = snapshot_path
81 |             # Handle the repos' special directory layouts
82 |             if model_type == 'main':
83 |                 self.main_model_path = Path(snapshot_path) / 'models'
84 |             else:
85 |                 self.layout_model_path = Path(snapshot_path)
86 | 
87 |         return model_paths
88 | 
89 |     def _generate_config(self):
90 |         """Generate the configuration file"""
91 |         template_path = "assets/magic-pdf-template.json"
92 |         try:
93 |             with open(template_path, "r") as f:
94 |                 template_config = json.load(f)
95 |             print(f"Loaded template config: {template_path}")
96 |         except Exception as e:
97 |             print(f"Failed to load template config, falling back to defaults: {e}")
98 |             template_config = {}
99 | 
100 |         custom_config = {
101 |             "device-mode": self.device,
102 |             "models-dir": str(self.main_model_path),
103 |             "layoutreader-model-dir": str(self.layout_model_path),
104 |         }
105 |         template_config.update(custom_config)
106 |         config = template_config
107 | 
108 |         if self.config_path.exists():
109 |             with open(self.config_path, 'r') as f:
110 |                 existing_config = json.load(f)
111 |             existing_config.update(custom_config)
112 |             config = existing_config
113 | 
114 |         with open(self.config_path, 'w') as f:
115 |             json.dump(config, f, indent=2)
116 | 
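A sketch of how this class is driven (the service's lifespan hook in `main.py` below does the same thing); the argument values here are examples, not project defaults:

```python
from core.model_manager import ModelConfigurator

configurator = ModelConfigurator(
    device="cpu",          # e.g. "cuda" on a GPU host
    models_dir=None,       # None => each hub's default cache directory
    use_modelscope=True,   # False => download from Hugging Face instead
)
# Downloads the MinerU model snapshots, writes magic-pdf.json, and points
# MINERU_TOOLS_CONFIG_JSON at it so magic-pdf can find the configuration.
configurator.setup_environment()
```
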
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import io
2 | import os
3 | import uuid
4 | from contextlib import asynccontextmanager
5 | from pathlib import Path
6 | from typing import Optional, List
7 | 
8 | import openai
9 | from fastapi import FastAPI, UploadFile, File, HTTPException, status, Depends, BackgroundTasks, Form, Query
10 | from fastapi.responses import FileResponse
11 | from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
12 | from pydantic import BaseModel
13 | from sqlalchemy.orm import Session
14 | from fastapi.staticfiles import StaticFiles
15 | 
16 | from core.markitdown import MarkItDown
17 | from core.base import DocumentConverterResult
18 | from core.model_manager import ModelConfigurator
19 | from repository.db import get_db, Job
20 | 
21 | # Security scheme
22 | security = HTTPBearer()
23 | 
24 | # Read the API key from the environment
25 | API_KEY = os.getenv("MARKIT_API_KEY", "secret-key")
26 | OUTPUT_DIR = Path("output")
27 | OUTPUT_DIR.mkdir(exist_ok=True)
28 | MINER_RUNNING_DEVICE = os.getenv("MINER_RUNNING_DEVICE", "cpu")
29 | port = int(os.getenv("PORT", 20926))
30 | 
31 | 
32 | # Dependency: API key verification
33 | async def verify_api_key(
34 |     credentials: HTTPAuthorizationCredentials = Depends(security)
35 | ):
36 |     if credentials.scheme != "Bearer" or credentials.credentials != API_KEY:
37 |         raise HTTPException(
38 |             status_code=status.HTTP_401_UNAUTHORIZED,
39 |             detail="Invalid API Key",
40 |         )
41 |     return credentials
42 | 
43 | 
44 | @asynccontextmanager
45 | async def lifespan(app: FastAPI):
46 |     """Lifecycle management for service startup and shutdown"""
47 |     try:
48 |         # Initialize the models
49 |         configurator = ModelConfigurator(
50 |             device=os.getenv("MINERU_DEVICE", MINER_RUNNING_DEVICE),
51 |             use_modelscope=os.getenv("MINERU_USE_MODELSCOPE", "true").lower() in ("true", "1")
52 |         )
53 |         configurator.setup_environment()
54 |         print("Model initialization complete")
55 |     except Exception as e:
56 |         print(f"Model initialization failed: {str(e)}")
57 |         raise
58 | 
59 |     yield  # application runs between startup and shutdown
60 | 
61 |     # Cleanup logic (optional)
62 |     print("Service shutting down, cleaning up resources...")
63 | 
64 | 
65 | # FastAPI application
66 | app = FastAPI(lifespan=lifespan)
67 | if not os.path.exists("output/images"):
68 |     os.mkdir("output/images")
69 | app.mount("/images", StaticFiles(directory="output/images"), name="images")
70 | 
71 | 
72 | # from slowapi import Limiter, _rate_limit_exceeded_handler
73 | # from slowapi.errors import RateLimitExceeded
74 | # from slowapi.util import get_remote_address
75 | # limiter = Limiter(key_func=get_remote_address)
76 | # app.state.limiter = limiter
77 | # app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
78 | # @limiter.limit("100/minute")
79 | 
80 | 
81 | # Response models
82 | class JobStatusResponse(BaseModel):
83 |     job_id: str
84 |     status: str
85 |     filename: str
86 |     params: dict
87 |     error: Optional[str]
88 | 
89 | 
90 | class JobResultResponse(BaseModel):
91 |     job_id: str
92 |     download_url: str
93 |     format: str
94 | 
95 | 
96 | oai_client = None
97 | if os.getenv("MARKIFY_LLM_API_KEY", None) and os.getenv("MARKIFY_LLM_API_BASE", None):
98 |     oai_client = openai.OpenAI(
99 |         api_key=os.getenv("MARKIFY_LLM_API_KEY", None),
100 |         base_url=os.getenv("MARKIFY_LLM_API_BASE", None)
101 |     )
102 | 
103 | 
104 | def process_file(db: Session, job_id: str, file_content: bytes, filename: str, mode: str = "simple"):
105 |     """Background task that converts an uploaded file"""
106 |     try:
107 |         # Mark the job as processing
108 |         job = db.query(Job).filter(Job.id == job_id).first()
109 |         if not job:
110 |             raise ValueError(f"Job {job_id} not found")
111 | 
112 |         job.status = "processing"
113 |         db.commit()
114 | 
115 |         # Create the converter
116 |         markitdown = MarkItDown(mode=mode,
117 |                                 llm_client=oai_client,
118 |                                 llm_model=os.getenv("MARKIFY_LLM_MODEL", None)
119 |                                 )
120 | 
121 |         # Dispatch on the input type
122 |         if filename.endswith('.md'):
123 |             result = DocumentConverterResult(text_content=file_content.decode('utf-8'))
124 |         else:
125 |             # Wrap the raw bytes in a file-like stream
126 |             file_stream = io.BytesIO(file_content)
127 |             result = markitdown.convert_stream(file_stream, file_extension=Path(filename).suffix, base_url=f"http://localhost:{port}")
128 | 
129 |         # Write the result to a file
130 |         output_file = OUTPUT_DIR / f"{job_id}.md"
131 |         with open(output_file, "w", encoding="utf-8") as f:
132 |             f.write(result.text_content)
133 | 
134 |         # Mark the job as completed
135 |         job.status = "completed"
136 |         job.result_file = str(output_file)
137 |         db.commit()
138 | 
139 |     except Exception as e:
140 |         # Mark the job as failed
141 |         job.status = "failed"
142 |         job.error = f"{type(e).__name__}: {str(e)}"
143 |         db.commit()
144 | 
145 | 
146 | @app.post("/api/jobs", status_code=status.HTTP_202_ACCEPTED)
147 | async def upload_file(
148 |     background_tasks: BackgroundTasks,
149 |     file: UploadFile = File(...),
150 |     mode: str = Form("simple"),
151 |     db: Session = Depends(get_db)
152 | ):
153 |     """Upload a file and start a conversion job"""
154 |     # Generate the job ID
155 |     job_id = str(uuid.uuid4())
156 | 
157 |     try:
158 |         # Read the file contents
159 |         content = await file.read()
160 | 
161 |         # Create the job record
162 |         job = Job(
163 |             id=job_id,
164 |             filename=file.filename,
165 |             params={"mode": mode},
params={"mode": mode}, 166 | status="pending" 167 | ) 168 | db.add(job) 169 | db.commit() 170 | 171 | # 启动后台任务 172 | background_tasks.add_task( 173 | process_file, 174 | db=db, 175 | job_id=job_id, 176 | file_content=content, 177 | filename=file.filename, 178 | mode=mode 179 | ) 180 | 181 | return {"job_id": job_id} 182 | 183 | except Exception as e: 184 | raise HTTPException( 185 | status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, 186 | detail=f"File upload failed: {str(e)}" 187 | ) 188 | 189 | 190 | @app.get("/api/jobs", response_model=List[JobStatusResponse]) 191 | async def list_jobs( 192 | db: Session = Depends(get_db), 193 | page: int = Query(0, ge=0, description=""), 194 | limit: int = Query(10, gt=0, le=100, description="default 10,max 100")): 195 | """查询任务状态""" 196 | jobs = db.query(Job).order_by(Job.created_at.desc()).limit(limit).offset(page * limit).all() 197 | if not jobs: 198 | raise HTTPException( 199 | status_code=status.HTTP_404_NOT_FOUND, 200 | detail="Job not found" 201 | ) 202 | 203 | response_list = [] 204 | for job in jobs: 205 | response_list.append(JobStatusResponse( 206 | job_id=job.id, 207 | status=job.status, 208 | filename=job.filename, 209 | params=job.params, 210 | error=job.error 211 | )) 212 | return response_list 213 | 214 | 215 | @app.get("/api/jobs/{job_id}", response_model=JobStatusResponse) 216 | async def get_job_status( 217 | job_id: str, 218 | db: Session = Depends(get_db) 219 | ): 220 | """查询任务状态""" 221 | job = db.query(Job).filter(Job.id == job_id).first() 222 | if not job: 223 | raise HTTPException( 224 | status_code=status.HTTP_404_NOT_FOUND, 225 | detail="Job not found" 226 | ) 227 | 228 | return JobStatusResponse( 229 | job_id=job.id, 230 | status=job.status, 231 | filename=job.filename, 232 | params=job.params, 233 | error=job.error 234 | ) 235 | 236 | 237 | @app.get("/api/jobs/{job_id}/result") 238 | async def download_result( 239 | job_id: str, 240 | db: Session = Depends(get_db) 241 | ): 242 | """下载任务结果文件""" 243 | job = db.query(Job).filter(Job.id == job_id).first() 244 | if not job: 245 | raise HTTPException( 246 | status_code=status.HTTP_404_NOT_FOUND, 247 | detail="Job not found" 248 | ) 249 | 250 | if job.status != "completed": 251 | raise HTTPException( 252 | status_code=status.HTTP_425_TOO_EARLY, 253 | detail="Job not completed" 254 | ) 255 | 256 | result_file = job.result_file 257 | if not result_file or not os.path.exists(result_file): 258 | raise HTTPException( 259 | status_code=status.HTTP_404_NOT_FOUND, 260 | detail="Result file not found" 261 | ) 262 | 263 | # 返回文件内容 264 | return FileResponse( 265 | result_file, 266 | filename=f"{job.filename}.md", 267 | media_type="text/markdown" 268 | ) 269 | 270 | 271 | if __name__ == "__main__": 272 | import uvicorn 273 | 274 | uvicorn.run(app, host="0.0.0.0", port=port) 275 | -------------------------------------------------------------------------------- /repository/db.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine, Column, String, Integer, JSON, DateTime 2 | from sqlalchemy.orm import declarative_base 3 | from sqlalchemy.orm import sessionmaker 4 | from datetime import datetime 5 | 6 | # SQLite 数据库路径 7 | DATABASE_URL = "sqlite:///./jobs.db" 8 | 9 | # 创建数据库引擎 10 | engine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False}) 11 | 12 | # 创建 SessionLocal 13 | SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) 14 | 15 | # 声明基类 16 | Base = declarative_base() 17 | 18 | 19 | 
--------------------------------------------------------------------------------
/repository/db.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy import create_engine, Column, String, Integer, JSON, DateTime
2 | from sqlalchemy.orm import declarative_base
3 | from sqlalchemy.orm import sessionmaker
4 | from datetime import datetime
5 | 
6 | # SQLite database path
7 | DATABASE_URL = "sqlite:///./jobs.db"
8 | 
9 | # Create the database engine
10 | engine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False})
11 | 
12 | # Create SessionLocal
13 | SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
14 | 
15 | # Declarative base class
16 | Base = declarative_base()
17 | 
18 | 
19 | # Job model
20 | class Job(Base):
21 |     __tablename__ = "jobs"
22 | 
23 |     id = Column(String, primary_key=True, index=True)
24 |     status = Column(String, default="pending")
25 |     filename = Column(String)
26 |     params = Column(JSON)
27 |     result_file = Column(String)
28 |     error = Column(String)
29 |     created_at = Column(DateTime, default=datetime.utcnow)
30 |     updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
31 | 
32 | 
33 | # Create the database tables
34 | Base.metadata.create_all(bind=engine)
35 | 
36 | 
37 | # Yield a database session (FastAPI dependency)
38 | def get_db():
39 |     db = SessionLocal()
40 |     try:
41 |         yield db
42 |     finally:
43 |         db.close()
44 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Core dependencies
2 | beautifulsoup4~=4.12.3
3 | requests~=2.32.3
4 | mammoth~=1.9.0
5 | markdownify~=0.14.1
6 | numpy
7 | python-pptx==1.0.2
8 | pandas~=2.2.3
9 | openpyxl==3.1.5
10 | xlrd==2.0.1
11 | puremagic~=1.28
12 | pydub~=0.25.1
13 | olefile~=0.47
14 | youtube-transcript-api==0.6.3
15 | SpeechRecognition==3.14.0
16 | pathvalidate==3.2.3
17 | charset-normalizer==3.4.1
18 | openai~=1.59.7
19 | magic-pdf[full] --extra-index-url https://wheels.myhloli.com
20 | modelscope~=1.22.2
21 | huggingface_hub~=0.27.1
22 | slowapi~=0.1.9
23 | limits~=4.0.1
24 | python-multipart~=0.0.20
25 | uvicorn>=0.34.0
26 | sqlalchemy>=2.0.37
27 | # Development dependencies (optional)
28 | mypy>=1.0.0
29 | 
30 | fastapi~=0.115.7
31 | pydantic~=2.10.5
32 | setuptools~=75.1.0
33 | 
34 | streamlit~=1.29.0
--------------------------------------------------------------------------------
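
Finally, a sketch of how `repository/db.py` can be used outside a FastAPI dependency, for example in scripts or tests; the query is illustrative:

```python
from repository.db import SessionLocal, Job

# get_db() is a generator intended for FastAPI's Depends(); plain code can
# simply open and close a session directly.
db = SessionLocal()
try:
    pending = db.query(Job).filter(Job.status == "pending").count()
    print(f"{pending} job(s) waiting")
finally:
    db.close()
```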