├── .devcontainer └── devcontainer.json ├── .dockerignore ├── .gitattributes ├── .github ├── dependabot.yml └── workflows │ ├── pre-commit.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md └── packages ├── markitdown-mcp ├── Dockerfile ├── README.md ├── pyproject.toml ├── src │ └── markitdown_mcp │ │ ├── __about__.py │ │ ├── __init__.py │ │ ├── __main__.py │ │ └── py.typed └── tests │ └── __init__.py ├── markitdown-sample-plugin ├── README.md ├── pyproject.toml ├── src │ └── markitdown_sample_plugin │ │ ├── __about__.py │ │ ├── __init__.py │ │ ├── _plugin.py │ │ └── py.typed └── tests │ ├── __init__.py │ ├── test_files │ └── test.rtf │ └── test_sample_plugin.py └── markitdown ├── README.md ├── ThirdPartyNotices.md ├── pyproject.toml ├── src └── markitdown │ ├── __about__.py │ ├── __init__.py │ ├── __main__.py │ ├── _base_converter.py │ ├── _exceptions.py │ ├── _markitdown.py │ ├── _stream_info.py │ ├── _uri_utils.py │ ├── converter_utils │ ├── __init__.py │ └── docx │ │ ├── __init__.py │ │ ├── math │ │ ├── __init__.py │ │ ├── latex_dict.py │ │ └── omml.py │ │ └── pre_process.py │ ├── converters │ ├── __init__.py │ ├── _audio_converter.py │ ├── _bing_serp_converter.py │ ├── _csv_converter.py │ ├── _doc_intel_converter.py │ ├── _docx_converter.py │ ├── _epub_converter.py │ ├── _exiftool.py │ ├── _html_converter.py │ ├── _image_converter.py │ ├── _ipynb_converter.py │ ├── _llm_caption.py │ ├── _markdownify.py │ ├── _outlook_msg_converter.py │ ├── _pdf_converter.py │ ├── _plain_text_converter.py │ ├── _pptx_converter.py │ ├── _rss_converter.py │ ├── _transcribe_audio.py │ ├── _wikipedia_converter.py │ ├── _xlsx_converter.py │ ├── _youtube_converter.py │ └── _zip_converter.py │ └── py.typed └── tests ├── __init__.py ├── _test_vectors.py ├── test_cli_misc.py ├── test_cli_vectors.py ├── test_files ├── equations.docx ├── random.bin ├── test.docx ├── test.epub ├── 
test.jpg ├── test.json ├── test.m4a ├── test.mp3 ├── test.pdf ├── test.pptx ├── test.wav ├── test.xls ├── test.xlsx ├── test_blog.html ├── test_files.zip ├── test_llm.jpg ├── test_mskanji.csv ├── test_notebook.ipynb ├── test_outlook_msg.msg ├── test_rss.xml ├── test_serp.html ├── test_wikipedia.html └── test_with_comment.docx ├── test_module_misc.py └── test_module_vectors.py /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the 2 | // README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile 3 | { 4 | "name": "Existing Dockerfile", 5 | "build": { 6 | // Sets the run context to one level up instead of the .devcontainer folder. 7 | "context": "..", 8 | // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. 9 | "dockerfile": "../Dockerfile", 10 | "args": { 11 | "INSTALL_GIT": "true" 12 | } 13 | }, 14 | 15 | // Features to add to the dev container. More info: https://containers.dev/features. 16 | // "features": {}, 17 | "features": { 18 | "ghcr.io/devcontainers-extra/features/hatch:2": {} 19 | }, 20 | 21 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 22 | // "forwardPorts": [], 23 | 24 | // Uncomment the next line to run commands after the container is created. 25 | // "postCreateCommand": "cat /etc/os-release", 26 | 27 | // Configure tool-specific properties. 28 | // "customizations": {}, 29 | 30 | // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root. 
31 | "remoteUser": "root" 32 | } 33 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !packages/ 3 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | packages/markitdown/tests/test_files/** linguist-vendored 2 | packages/markitdown-sample-plugin/tests/test_files/** linguist-vendored 3 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | on: [pull_request] 3 | 4 | jobs: 5 | pre-commit: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - uses: actions/checkout@v4 9 | - name: Set up Python 10 | uses: actions/setup-python@v5 11 | with: 12 | python-version: "3.x" 13 | 14 | - name: Install pre-commit 15 | run: | 16 | pip install pre-commit 17 | pre-commit install --install-hooks 18 | 19 | - name: Run pre-commit 20 | run: pre-commit run --all-files 21 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | on: [pull_request] 3 | 4 | jobs: 5 | tests: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - uses: actions/checkout@v4 9 | - uses: actions/setup-python@v5 10 | with: 11 | python-version: | 12 | 3.10 13 | 3.11 14 | 3.12 15 | - name: Install Hatch 16 | run: pipx install hatch 17 | - 
name: Run tests 18 | run: cd packages/markitdown; hatch test 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # poetry 100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 101 | # This is especially recommended for binary packages to ensure reproducibility, and is more 102 | # commonly ignored for libraries. 103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 104 | #poetry.lock 105 | 106 | # pdm 107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 108 | #pdm.lock 109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 110 | # in version control. 111 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 112 | .pdm.toml 113 | .pdm-python 114 | .pdm-build/ 115 | 116 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | #.idea/ 165 | src/.DS_Store 166 | .DS_Store 167 | .cursorrules 168 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 23.7.0 # Use the latest version of Black 4 | hooks: 5 | - id: black 6 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.13-slim-bullseye 2 | 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | ENV EXIFTOOL_PATH=/usr/bin/exiftool 5 | ENV FFMPEG_PATH=/usr/bin/ffmpeg 6 | 7 | # Runtime dependency 8 | RUN apt-get update && apt-get install -y --no-install-recommends \ 9 | ffmpeg \ 10 | exiftool 11 | 12 | ARG INSTALL_GIT=false 13 | RUN if [ "$INSTALL_GIT" = "true" ]; then \ 14 | apt-get install -y --no-install-recommends \ 15 | git; \ 16 | fi 17 | 18 | # Cleanup 19 | RUN rm -rf /var/lib/apt/lists/* 20 | 21 | WORKDIR /app 22 | COPY . /app 23 | RUN pip --no-cache-dir install \ 24 | /app/packages/markitdown[all] \ 25 | /app/packages/markitdown-sample-plugin 26 | 27 | # Default USERID and GROUPID 28 | ARG USERID=nobody 29 | ARG GROUPID=nogroup 30 | 31 | USER $USERID:$GROUPID 32 | 33 | ENTRYPOINT [ "markitdown" ] 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. 
8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 
36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 
26 | -------------------------------------------------------------------------------- /packages/markitdown-mcp/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.13-slim-bullseye 2 | 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | ENV EXIFTOOL_PATH=/usr/bin/exiftool 5 | ENV FFMPEG_PATH=/usr/bin/ffmpeg 6 | 7 | # Runtime dependency 8 | RUN apt-get update && apt-get install -y --no-install-recommends \ 9 | ffmpeg \ 10 | exiftool 11 | 12 | # Cleanup 13 | RUN rm -rf /var/lib/apt/lists/* 14 | 15 | COPY . /app 16 | RUN pip --no-cache-dir install /app 17 | 18 | WORKDIR /workdir 19 | 20 | # Default USERID and GROUPID 21 | ARG USERID=nobody 22 | ARG GROUPID=nogroup 23 | 24 | USER $USERID:$GROUPID 25 | 26 | ENTRYPOINT [ "markitdown-mcp" ] 27 | -------------------------------------------------------------------------------- /packages/markitdown-mcp/README.md: -------------------------------------------------------------------------------- 1 | # MarkItDown-MCP 2 | 3 | [![PyPI](https://img.shields.io/pypi/v/markitdown-mcp.svg)](https://pypi.org/project/markitdown-mcp/) 4 | ![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown-mcp) 5 | [![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen) 6 | 7 | The `markitdown-mcp` package provides a lightweight STDIO, Streamable HTTP, and SSE MCP server for calling MarkItDown. 8 | 9 | It exposes one tool: `convert_to_markdown(uri)`, where uri can be any `http:`, `https:`, `file:`, or `data:` URI. 
10 | 11 | ## Installation 12 | 13 | To install the package, use pip: 14 | 15 | ```bash 16 | pip install markitdown-mcp 17 | ``` 18 | 19 | ## Usage 20 | 21 | To run the MCP server, using STDIO (default) use the following command: 22 | 23 | 24 | ```bash 25 | markitdown-mcp 26 | ``` 27 | 28 | To run the MCP server, using Streamable HTTP and SSE use the following command: 29 | 30 | ```bash 31 | markitdown-mcp --http --host 127.0.0.1 --port 3001 32 | ``` 33 | 34 | ## Running in Docker 35 | 36 | To run `markitdown-mcp` in Docker, build the Docker image using the provided Dockerfile: 37 | ```bash 38 | docker build -t markitdown-mcp:latest . 39 | ``` 40 | 41 | And run it using: 42 | ```bash 43 | docker run -it --rm markitdown-mcp:latest 44 | ``` 45 | This will be sufficient for remote URIs. To access local files, you need to mount the local directory into the container. For example, if you want to access files in `/home/user/data`, you can run: 46 | 47 | ```bash 48 | docker run -it --rm -v /home/user/data:/workdir markitdown-mcp:latest 49 | ``` 50 | 51 | Once mounted, all files under data will be accessible under `/workdir` in the container. For example, if you have a file `example.txt` in `/home/user/data`, it will be accessible in the container at `/workdir/example.txt`. 52 | 53 | ## Accessing from Claude Desktop 54 | 55 | It is recommended to use the Docker image when running the MCP server for Claude Desktop. 56 | 57 | Follow [these instrutions](https://modelcontextprotocol.io/quickstart/user#for-claude-desktop-users) to access Claude's `claude_desktop_config.json` file. 
58 | 59 | Edit it to include the following JSON entry: 60 | 61 | ```json 62 | { 63 | "mcpServers": { 64 | "markitdown": { 65 | "command": "docker", 66 | "args": [ 67 | "run", 68 | "--rm", 69 | "-i", 70 | "markitdown-mcp:latest" 71 | ] 72 | } 73 | } 74 | } 75 | ``` 76 | 77 | If you want to mount a directory, adjust it accordingly: 78 | 79 | ```json 80 | { 81 | "mcpServers": { 82 | "markitdown": { 83 | "command": "docker", 84 | "args": [ 85 | "run", 86 | "--rm", 87 | "-i", 88 | "-v", 89 | "/home/user/data:/workdir", 90 | "markitdown-mcp:latest" 91 | ] 92 | } 93 | } 94 | } 95 | ``` 96 | 97 | ## Debugging 98 | 99 | To debug the MCP server you can use the `mcpinspector` tool. 100 | 101 | ```bash 102 | npx @modelcontextprotocol/inspector 103 | ``` 104 | 105 | You can then connect to the insepctor through the specified host and port (e.g., `http://localhost:5173/`). 106 | 107 | If using STDIO: 108 | * select `STDIO` as the transport type, 109 | * input `markitdown-mcp` as the command, and 110 | * click `Connect` 111 | 112 | If using Streamable HTTP: 113 | * select `Streamable HTTP` as the transport type, 114 | * input `http://127.0.0.1:3001/mcp` as the URL, and 115 | * click `Connect` 116 | 117 | If using SSE: 118 | * select `SSE` as the transport type, 119 | * input `http://127.0.0.1:3001/sse` as the URL, and 120 | * click `Connect` 121 | 122 | Finally: 123 | * click the `Tools` tab, 124 | * click `List Tools`, 125 | * click `convert_to_markdown`, and 126 | * run the tool on any valid URI. 127 | 128 | ## Security Considerations 129 | 130 | The server does not support authentication, and runs with the privileges if the user running it. For this reason, when running in SSE or Streamable HTTP mode, it is recommended to run the server bound to `localhost` (default). 131 | 132 | 133 | ## Trademarks 134 | 135 | This project may contain trademarks or logos for projects, products, or services. 
Authorized use of Microsoft 136 | trademarks or logos is subject to and must follow 137 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 138 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 139 | Any use of third-party trademarks or logos are subject to those third-party's policies. 140 | -------------------------------------------------------------------------------- /packages/markitdown-mcp/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "markitdown-mcp" 7 | dynamic = ["version"] 8 | description = 'An MCP server for the "markitdown" library.' 9 | readme = "README.md" 10 | requires-python = ">=3.10" 11 | license = "MIT" 12 | keywords = [] 13 | authors = [ 14 | { name = "Adam Fourney", email = "adamfo@microsoft.com" }, 15 | ] 16 | classifiers = [ 17 | "Development Status :: 4 - Beta", 18 | "Programming Language :: Python", 19 | "Programming Language :: Python :: 3.10", 20 | "Programming Language :: Python :: 3.11", 21 | "Programming Language :: Python :: 3.12", 22 | "Programming Language :: Python :: 3.13", 23 | "Programming Language :: Python :: Implementation :: CPython", 24 | "Programming Language :: Python :: Implementation :: PyPy", 25 | ] 26 | dependencies = [ 27 | "mcp~=1.8.0", 28 | "markitdown[all]>=0.1.1,<0.2.0", 29 | ] 30 | 31 | [project.urls] 32 | Documentation = "https://github.com/microsoft/markitdown#readme" 33 | Issues = "https://github.com/microsoft/markitdown/issues" 34 | Source = "https://github.com/microsoft/markitdown" 35 | 36 | [tool.hatch.version] 37 | path = "src/markitdown_mcp/__about__.py" 38 | 39 | [project.scripts] 40 | markitdown-mcp = "markitdown_mcp.__main__:main" 41 | 42 | [tool.hatch.envs.types] 43 | 
extra-dependencies = [ 44 | "mypy>=1.0.0", 45 | ] 46 | [tool.hatch.envs.types.scripts] 47 | check = "mypy --install-types --non-interactive {args:src/markitdown_mcp tests}" 48 | 49 | [tool.coverage.run] 50 | source_pkgs = ["markitdown-mcp", "tests"] 51 | branch = true 52 | parallel = true 53 | omit = [ 54 | "src/markitdown_mcp/__about__.py", 55 | ] 56 | 57 | [tool.coverage.paths] 58 | markitdown-mcp = ["src/markitdown_mcp", "*/markitdown-mcp/src/markitdown_mcp"] 59 | tests = ["tests", "*/markitdown-mcp/tests"] 60 | 61 | [tool.coverage.report] 62 | exclude_lines = [ 63 | "no cov", 64 | "if __name__ == .__main__.:", 65 | "if TYPE_CHECKING:", 66 | ] 67 | 68 | [tool.hatch.build.targets.sdist] 69 | only-include = ["src/markitdown_mcp"] 70 | -------------------------------------------------------------------------------- /packages/markitdown-mcp/src/markitdown_mcp/__about__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney 2 | # 3 | # SPDX-License-Identifier: MIT 4 | __version__ = "0.0.1a4" 5 | -------------------------------------------------------------------------------- /packages/markitdown-mcp/src/markitdown_mcp/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney 2 | # 3 | # SPDX-License-Identifier: MIT 4 | 5 | from .__about__ import __version__ 6 | 7 | __all__ = [ 8 | "__version__", 9 | ] 10 | -------------------------------------------------------------------------------- /packages/markitdown-mcp/src/markitdown_mcp/__main__.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import sys 3 | from collections.abc import AsyncIterator 4 | from mcp.server.fastmcp import FastMCP 5 | from starlette.applications import Starlette 6 | from mcp.server.sse import SseServerTransport 7 | from starlette.requests import Request 8 | from 
starlette.routing import Mount, Route 9 | from starlette.types import Receive, Scope, Send 10 | from mcp.server import Server 11 | from mcp.server.streamable_http_manager import StreamableHTTPSessionManager 12 | from markitdown import MarkItDown 13 | import uvicorn 14 | 15 | # Initialize FastMCP server for MarkItDown (SSE) 16 | mcp = FastMCP("markitdown") 17 | 18 | 19 | @mcp.tool() 20 | async def convert_to_markdown(uri: str) -> str: 21 | """Convert a resource described by an http:, https:, file: or data: URI to markdown""" 22 | return MarkItDown().convert_uri(uri).markdown 23 | 24 | 25 | def create_starlette_app(mcp_server: Server, *, debug: bool = False) -> Starlette: 26 | sse = SseServerTransport("/messages/") 27 | session_manager = StreamableHTTPSessionManager( 28 | app=mcp_server, 29 | event_store=None, 30 | json_response=True, 31 | stateless=True, 32 | ) 33 | 34 | async def handle_sse(request: Request) -> None: 35 | async with sse.connect_sse( 36 | request.scope, 37 | request.receive, 38 | request._send, 39 | ) as (read_stream, write_stream): 40 | await mcp_server.run( 41 | read_stream, 42 | write_stream, 43 | mcp_server.create_initialization_options(), 44 | ) 45 | 46 | async def handle_streamable_http( 47 | scope: Scope, receive: Receive, send: Send 48 | ) -> None: 49 | await session_manager.handle_request(scope, receive, send) 50 | 51 | @contextlib.asynccontextmanager 52 | async def lifespan(app: Starlette) -> AsyncIterator[None]: 53 | """Context manager for session manager.""" 54 | async with session_manager.run(): 55 | print("Application started with StreamableHTTP session manager!") 56 | try: 57 | yield 58 | finally: 59 | print("Application shutting down...") 60 | 61 | return Starlette( 62 | debug=debug, 63 | routes=[ 64 | Route("/sse", endpoint=handle_sse), 65 | Mount("/mcp", app=handle_streamable_http), 66 | Mount("/messages/", app=sse.handle_post_message), 67 | ], 68 | lifespan=lifespan, 69 | ) 70 | 71 | 72 | # Main entry point 73 | def main(): 74 | 
import argparse 75 | 76 | mcp_server = mcp._mcp_server 77 | 78 | parser = argparse.ArgumentParser(description="Run a MarkItDown MCP server") 79 | 80 | parser.add_argument( 81 | "--http", 82 | action="store_true", 83 | help="Run the server with Streamable HTTP and SSE transport rather than STDIO (default: False)", 84 | ) 85 | parser.add_argument( 86 | "--sse", 87 | action="store_true", 88 | help="(Deprecated) An alias for --http (default: False)", 89 | ) 90 | parser.add_argument( 91 | "--host", default=None, help="Host to bind to (default: 127.0.0.1)" 92 | ) 93 | parser.add_argument( 94 | "--port", type=int, default=None, help="Port to listen on (default: 3001)" 95 | ) 96 | args = parser.parse_args() 97 | 98 | use_http = args.http or args.sse 99 | 100 | if not use_http and (args.host or args.port): 101 | parser.error( 102 | "Host and port arguments are only valid when using streamable HTTP or SSE transport (see: --http)." 103 | ) 104 | sys.exit(1) 105 | 106 | if use_http: 107 | starlette_app = create_starlette_app(mcp_server, debug=True) 108 | uvicorn.run( 109 | starlette_app, 110 | host=args.host if args.host else "127.0.0.1", 111 | port=args.port if args.port else 3001, 112 | ) 113 | else: 114 | mcp.run() 115 | 116 | 117 | if __name__ == "__main__": 118 | main() 119 | -------------------------------------------------------------------------------- /packages/markitdown-mcp/src/markitdown_mcp/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown-mcp/src/markitdown_mcp/py.typed -------------------------------------------------------------------------------- /packages/markitdown-mcp/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney 2 | # 3 | # SPDX-License-Identifier: MIT 4 | 
-------------------------------------------------------------------------------- /packages/markitdown-sample-plugin/README.md: -------------------------------------------------------------------------------- 1 | # MarkItDown Sample Plugin 2 | 3 | [![PyPI](https://img.shields.io/pypi/v/markitdown-sample-plugin.svg)](https://pypi.org/project/markitdown-sample-plugin/) 4 | ![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown-sample-plugin) 5 | [![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen) 6 | 7 | 8 | This project shows how to create a sample plugin for MarkItDown. The most important parts are as follows: 9 | 10 | Next, implement your custom DocumentConverter: 11 | 12 | ```python 13 | from typing import BinaryIO, Any 14 | from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult, StreamInfo 15 | 16 | class RtfConverter(DocumentConverter): 17 | 18 | def __init__( 19 | self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT 20 | ): 21 | super().__init__(priority=priority) 22 | 23 | def accepts( 24 | self, 25 | file_stream: BinaryIO, 26 | stream_info: StreamInfo, 27 | **kwargs: Any, 28 | ) -> bool: 29 | 30 | # Implement logic to check if the file stream is an RTF file 31 | # ... 32 | raise NotImplementedError() 33 | 34 | 35 | def convert( 36 | self, 37 | file_stream: BinaryIO, 38 | stream_info: StreamInfo, 39 | **kwargs: Any, 40 | ) -> DocumentConverterResult: 41 | 42 | # Implement logic to convert the file stream to Markdown 43 | # ... 44 | raise NotImplementedError() 45 | ``` 46 | 47 | Next, make sure your package implements and exports the following: 48 | 49 | ```python 50 | # The version of the plugin interface that this plugin uses. 51 | # The only supported version is 1 for now. 52 | __plugin_interface_version__ = 1 53 | 54 | # The main entrypoint for the plugin. This is called each time MarkItDown instances are created. 
55 | def register_converters(markitdown: MarkItDown, **kwargs): 56 | """ 57 | Called during construction of MarkItDown instances to register converters provided by plugins. 58 | """ 59 | 60 | # Simply create and attach an RtfConverter instance 61 | markitdown.register_converter(RtfConverter()) 62 | ``` 63 | 64 | 65 | Finally, create an entrypoint in the `pyproject.toml` file: 66 | 67 | ```toml 68 | [project.entry-points."markitdown.plugin"] 69 | sample_plugin = "markitdown_sample_plugin" 70 | ``` 71 | 72 | Here, the value of `sample_plugin` can be any key, but should ideally be the name of the plugin. The value is the fully qualified name of the package implementing the plugin. 73 | 74 | 75 | ## Installation 76 | 77 | To use the plugin with MarkItDown, it must be installed. To install the plugin from the current directory use: 78 | 79 | ```bash 80 | pip install -e . 81 | ``` 82 | 83 | Once the plugin package is installed, verify that it is available to MarkItDown by running: 84 | 85 | ```bash 86 | markitdown --list-plugins 87 | ``` 88 | 89 | To use the plugin for a conversion use the `--use-plugins` flag. For example, to convert an RTF file: 90 | 91 | ```bash 92 | markitdown --use-plugins path-to-file.rtf 93 | ``` 94 | 95 | In Python, plugins can be enabled as follows: 96 | 97 | ```python 98 | from markitdown import MarkItDown 99 | 100 | md = MarkItDown(enable_plugins=True) 101 | result = md.convert("path-to-file.rtf") 102 | print(result.text_content) 103 | ``` 104 | 105 | ## Trademarks 106 | 107 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft 108 | trademarks or logos is subject to and must follow 109 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 110 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 
111 | Any use of third-party trademarks or logos are subject to those third-party's policies. 112 | -------------------------------------------------------------------------------- /packages/markitdown-sample-plugin/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "markitdown-sample-plugin" 7 | dynamic = ["version"] 8 | description = 'A sample plugin for the "markitdown" library.' 9 | readme = "README.md" 10 | requires-python = ">=3.10" 11 | license = "MIT" 12 | keywords = [] 13 | authors = [ 14 | { name = "Adam Fourney", email = "adamfo@microsoft.com" }, 15 | ] 16 | classifiers = [ 17 | "Development Status :: 4 - Beta", 18 | "Programming Language :: Python", 19 | "Programming Language :: Python :: 3.10", 20 | "Programming Language :: Python :: 3.11", 21 | "Programming Language :: Python :: 3.12", 22 | "Programming Language :: Python :: 3.13", 23 | "Programming Language :: Python :: Implementation :: CPython", 24 | "Programming Language :: Python :: Implementation :: PyPy", 25 | ] 26 | dependencies = [ 27 | "markitdown>=0.1.0a1", 28 | "striprtf", 29 | ] 30 | 31 | [project.urls] 32 | Documentation = "https://github.com/microsoft/markitdown#readme" 33 | Issues = "https://github.com/microsoft/markitdown/issues" 34 | Source = "https://github.com/microsoft/markitdown" 35 | 36 | [tool.hatch.version] 37 | path = "src/markitdown_sample_plugin/__about__.py" 38 | 39 | # IMPORTANT: MarkItDown will look for this entry point to find the plugin. 
40 | [project.entry-points."markitdown.plugin"] 41 | sample_plugin = "markitdown_sample_plugin" 42 | 43 | [tool.hatch.envs.types] 44 | extra-dependencies = [ 45 | "mypy>=1.0.0", 46 | ] 47 | [tool.hatch.envs.types.scripts] 48 | check = "mypy --install-types --non-interactive {args:src/markitdown_sample_plugin tests}" 49 | 50 | [tool.coverage.run] 51 | source_pkgs = ["markitdown-sample-plugin", "tests"] 52 | branch = true 53 | parallel = true 54 | omit = [ 55 | "src/markitdown_sample_plugin/__about__.py", 56 | ] 57 | 58 | [tool.coverage.paths] 59 | markitdown-sample-plugin = ["src/markitdown_sample_plugin", "*/markitdown-sample-plugin/src/markitdown_sample_plugin"] 60 | tests = ["tests", "*/markitdown-sample-plugin/tests"] 61 | 62 | [tool.coverage.report] 63 | exclude_lines = [ 64 | "no cov", 65 | "if __name__ == .__main__.:", 66 | "if TYPE_CHECKING:", 67 | ] 68 | 69 | [tool.hatch.build.targets.sdist] 70 | only-include = ["src/markitdown_sample_plugin"] 71 | -------------------------------------------------------------------------------- /packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__about__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney 2 | # 3 | # SPDX-License-Identifier: MIT 4 | __version__ = "0.1.0a1" 5 | -------------------------------------------------------------------------------- /packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney 2 | # 3 | # SPDX-License-Identifier: MIT 4 | 5 | from ._plugin import __plugin_interface_version__, register_converters, RtfConverter 6 | from .__about__ import __version__ 7 | 8 | __all__ = [ 9 | "__version__", 10 | "__plugin_interface_version__", 11 | "register_converters", 12 | "RtfConverter", 13 | ] 14 | 
-------------------------------------------------------------------------------- /packages/markitdown-sample-plugin/src/markitdown_sample_plugin/_plugin.py: -------------------------------------------------------------------------------- 1 | import locale 2 | from typing import BinaryIO, Any 3 | from striprtf.striprtf import rtf_to_text 4 | 5 | from markitdown import ( 6 | MarkItDown, 7 | DocumentConverter, 8 | DocumentConverterResult, 9 | StreamInfo, 10 | ) 11 | 12 | 13 | __plugin_interface_version__ = ( 14 | 1 # The version of the plugin interface that this plugin uses 15 | ) 16 | 17 | ACCEPTED_MIME_TYPE_PREFIXES = [ 18 | "text/rtf", 19 | "application/rtf", 20 | ] 21 | 22 | ACCEPTED_FILE_EXTENSIONS = [".rtf"] 23 | 24 | 25 | def register_converters(markitdown: MarkItDown, **kwargs): 26 | """ 27 | Called during construction of MarkItDown instances to register converters provided by plugins. 28 | """ 29 | 30 | # Simply create and attach an RtfConverter instance 31 | markitdown.register_converter(RtfConverter()) 32 | 33 | 34 | class RtfConverter(DocumentConverter): 35 | """ 36 | Converts an RTF file to in the simplest possible way. 
37 | """ 38 | 39 | def accepts( 40 | self, 41 | file_stream: BinaryIO, 42 | stream_info: StreamInfo, 43 | **kwargs: Any, 44 | ) -> bool: 45 | mimetype = (stream_info.mimetype or "").lower() 46 | extension = (stream_info.extension or "").lower() 47 | 48 | if extension in ACCEPTED_FILE_EXTENSIONS: 49 | return True 50 | 51 | for prefix in ACCEPTED_MIME_TYPE_PREFIXES: 52 | if mimetype.startswith(prefix): 53 | return True 54 | 55 | return False 56 | 57 | def convert( 58 | self, 59 | file_stream: BinaryIO, 60 | stream_info: StreamInfo, 61 | **kwargs: Any, 62 | ) -> DocumentConverterResult: 63 | # Read the file stream into an str using hte provided charset encoding, or using the system default 64 | encoding = stream_info.charset or locale.getpreferredencoding() 65 | stream_data = file_stream.read().decode(encoding) 66 | 67 | # Return the result 68 | return DocumentConverterResult( 69 | title=None, 70 | markdown=rtf_to_text(stream_data), 71 | ) 72 | -------------------------------------------------------------------------------- /packages/markitdown-sample-plugin/src/markitdown_sample_plugin/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/py.typed -------------------------------------------------------------------------------- /packages/markitdown-sample-plugin/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney 2 | # 3 | # SPDX-License-Identifier: MIT 4 | -------------------------------------------------------------------------------- /packages/markitdown-sample-plugin/tests/test_sample_plugin.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 -m pytest 2 | import os 3 | 4 | from markitdown import MarkItDown, 
StreamInfo 5 | from markitdown_sample_plugin import RtfConverter 6 | 7 | TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") 8 | 9 | RTF_TEST_STRINGS = { 10 | "This is a Sample RTF File", 11 | "It is included to test if the MarkItDown sample plugin can correctly convert RTF files.", 12 | } 13 | 14 | 15 | def test_converter() -> None: 16 | """Tests the RTF converter dirctly.""" 17 | with open(os.path.join(TEST_FILES_DIR, "test.rtf"), "rb") as file_stream: 18 | converter = RtfConverter() 19 | result = converter.convert( 20 | file_stream=file_stream, 21 | stream_info=StreamInfo( 22 | mimetype="text/rtf", extension=".rtf", filename="test.rtf" 23 | ), 24 | ) 25 | 26 | for test_string in RTF_TEST_STRINGS: 27 | assert test_string in result.text_content 28 | 29 | 30 | def test_markitdown() -> None: 31 | """Tests that MarkItDown correctly loads the plugin.""" 32 | md = MarkItDown(enable_plugins=True) 33 | result = md.convert(os.path.join(TEST_FILES_DIR, "test.rtf")) 34 | 35 | for test_string in RTF_TEST_STRINGS: 36 | assert test_string in result.text_content 37 | 38 | 39 | if __name__ == "__main__": 40 | """Runs this file's tests from the command line.""" 41 | test_converter() 42 | test_markitdown() 43 | print("All tests passed.") 44 | -------------------------------------------------------------------------------- /packages/markitdown/README.md: -------------------------------------------------------------------------------- 1 | # MarkItDown 2 | 3 | > [!IMPORTANT] 4 | > MarkItDown is a Python package and command-line utility for converting various files to Markdown (e.g., for indexing, text analysis, etc). 5 | > 6 | > For more information, and full documentation, see the project [README.md](https://github.com/microsoft/markitdown) on GitHub. 
7 | 8 | ## Installation 9 | 10 | From PyPI: 11 | 12 | ```bash 13 | pip install markitdown[all] 14 | ``` 15 | 16 | From source: 17 | 18 | ```bash 19 | git clone git@github.com:microsoft/markitdown.git 20 | cd markitdown 21 | pip install -e packages/markitdown[all] 22 | ``` 23 | 24 | ## Usage 25 | 26 | ### Command-Line 27 | 28 | ```bash 29 | markitdown path-to-file.pdf > document.md 30 | ``` 31 | 32 | ### Python API 33 | 34 | ```python 35 | from markitdown import MarkItDown 36 | 37 | md = MarkItDown() 38 | result = md.convert("test.xlsx") 39 | print(result.text_content) 40 | ``` 41 | 42 | ### More Information 43 | 44 | For more information, and full documentation, see the project [README.md](https://github.com/microsoft/markitdown) on GitHub. 45 | 46 | ## Trademarks 47 | 48 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft 49 | trademarks or logos is subject to and must follow 50 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 51 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 52 | Any use of third-party trademarks or logos are subject to those third-party's policies. 
53 | -------------------------------------------------------------------------------- /packages/markitdown/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "markitdown" 7 | dynamic = ["version"] 8 | description = 'Utility tool for converting various files to Markdown' 9 | readme = "README.md" 10 | requires-python = ">=3.10" 11 | license = "MIT" 12 | keywords = [] 13 | authors = [ 14 | { name = "Adam Fourney", email = "adamfo@microsoft.com" }, 15 | ] 16 | classifiers = [ 17 | "Development Status :: 4 - Beta", 18 | "Programming Language :: Python", 19 | "Programming Language :: Python :: 3.10", 20 | "Programming Language :: Python :: 3.11", 21 | "Programming Language :: Python :: 3.12", 22 | "Programming Language :: Python :: 3.13", 23 | "Programming Language :: Python :: Implementation :: CPython", 24 | "Programming Language :: Python :: Implementation :: PyPy", 25 | ] 26 | dependencies = [ 27 | "beautifulsoup4", 28 | "requests", 29 | "markdownify", 30 | "magika~=0.6.1", 31 | "charset-normalizer", 32 | "defusedxml", 33 | "onnxruntime<=1.20.1; sys_platform == 'win32'", 34 | ] 35 | 36 | [project.optional-dependencies] 37 | all = [ 38 | "python-pptx", 39 | "mammoth", 40 | "pandas", 41 | "openpyxl", 42 | "xlrd", 43 | "lxml", 44 | "pdfminer.six", 45 | "olefile", 46 | "pydub", 47 | "SpeechRecognition", 48 | "youtube-transcript-api~=1.0.0", 49 | "azure-ai-documentintelligence", 50 | "azure-identity" 51 | ] 52 | pptx = ["python-pptx"] 53 | docx = ["mammoth", "lxml"] 54 | xlsx = ["pandas", "openpyxl"] 55 | xls = ["pandas", "xlrd"] 56 | pdf = ["pdfminer.six"] 57 | outlook = ["olefile"] 58 | audio-transcription = ["pydub", "SpeechRecognition"] 59 | youtube-transcription = ["youtube-transcript-api"] 60 | az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"] 61 | 62 | [project.urls] 63 | Documentation = 
"https://github.com/microsoft/markitdown#readme" 64 | Issues = "https://github.com/microsoft/markitdown/issues" 65 | Source = "https://github.com/microsoft/markitdown" 66 | 67 | [tool.hatch.version] 68 | path = "src/markitdown/__about__.py" 69 | 70 | [project.scripts] 71 | markitdown = "markitdown.__main__:main" 72 | 73 | [tool.hatch.envs.default] 74 | features = ["all"] 75 | 76 | [tool.hatch.envs.hatch-test] 77 | features = ["all"] 78 | extra-dependencies = [ 79 | "openai", 80 | ] 81 | 82 | [tool.hatch.envs.types] 83 | features = ["all"] 84 | extra-dependencies = [ 85 | "openai", 86 | "mypy>=1.0.0", 87 | ] 88 | 89 | [tool.hatch.envs.types.scripts] 90 | check = "mypy --install-types --non-interactive --ignore-missing-imports {args:src/markitdown tests}" 91 | 92 | [tool.coverage.run] 93 | source_pkgs = ["markitdown", "tests"] 94 | branch = true 95 | parallel = true 96 | omit = [ 97 | "src/markitdown/__about__.py", 98 | ] 99 | 100 | [tool.coverage.paths] 101 | markitdown = ["src/markitdown", "*/markitdown/src/markitdown"] 102 | tests = ["tests", "*/markitdown/tests"] 103 | 104 | [tool.coverage.report] 105 | exclude_lines = [ 106 | "no cov", 107 | "if __name__ == .__main__.:", 108 | "if TYPE_CHECKING:", 109 | ] 110 | 111 | [tool.hatch.build.targets.sdist] 112 | only-include = ["src/markitdown"] 113 | -------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/__about__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney 2 | # 3 | # SPDX-License-Identifier: MIT 4 | __version__ = "0.1.2" 5 | -------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney 2 | # 3 | # SPDX-License-Identifier: MIT 4 | 5 | from .__about__ import 
__version__ 6 | from ._markitdown import ( 7 | MarkItDown, 8 | PRIORITY_SPECIFIC_FILE_FORMAT, 9 | PRIORITY_GENERIC_FILE_FORMAT, 10 | ) 11 | from ._base_converter import DocumentConverterResult, DocumentConverter 12 | from ._stream_info import StreamInfo 13 | from ._exceptions import ( 14 | MarkItDownException, 15 | MissingDependencyException, 16 | FailedConversionAttempt, 17 | FileConversionException, 18 | UnsupportedFormatException, 19 | ) 20 | 21 | __all__ = [ 22 | "__version__", 23 | "MarkItDown", 24 | "DocumentConverter", 25 | "DocumentConverterResult", 26 | "MarkItDownException", 27 | "MissingDependencyException", 28 | "FailedConversionAttempt", 29 | "FileConversionException", 30 | "UnsupportedFormatException", 31 | "StreamInfo", 32 | "PRIORITY_SPECIFIC_FILE_FORMAT", 33 | "PRIORITY_GENERIC_FILE_FORMAT", 34 | ] 35 | -------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/__main__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney 2 | # 3 | # SPDX-License-Identifier: MIT 4 | import argparse 5 | import sys 6 | import codecs 7 | from textwrap import dedent 8 | from importlib.metadata import entry_points 9 | from .__about__ import __version__ 10 | from ._markitdown import MarkItDown, StreamInfo, DocumentConverterResult 11 | 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser( 15 | description="Convert various file formats to markdown.", 16 | prog="markitdown", 17 | formatter_class=argparse.RawDescriptionHelpFormatter, 18 | usage=dedent( 19 | """ 20 | SYNTAX: 21 | 22 | markitdown 23 | If FILENAME is empty, markitdown reads from stdin. 
24 | 25 | EXAMPLE: 26 | 27 | markitdown example.pdf 28 | 29 | OR 30 | 31 | cat example.pdf | markitdown 32 | 33 | OR 34 | 35 | markitdown < example.pdf 36 | 37 | OR to save to a file use 38 | 39 | markitdown example.pdf -o example.md 40 | 41 | OR 42 | 43 | markitdown example.pdf > example.md 44 | """ 45 | ).strip(), 46 | ) 47 | 48 | parser.add_argument( 49 | "-v", 50 | "--version", 51 | action="version", 52 | version=f"%(prog)s {__version__}", 53 | help="show the version number and exit", 54 | ) 55 | 56 | parser.add_argument( 57 | "-o", 58 | "--output", 59 | help="Output file name. If not provided, output is written to stdout.", 60 | ) 61 | 62 | parser.add_argument( 63 | "-x", 64 | "--extension", 65 | help="Provide a hint about the file extension (e.g., when reading from stdin).", 66 | ) 67 | 68 | parser.add_argument( 69 | "-m", 70 | "--mime-type", 71 | help="Provide a hint about the file's MIME type.", 72 | ) 73 | 74 | parser.add_argument( 75 | "-c", 76 | "--charset", 77 | help="Provide a hint about the file's charset (e.g, UTF-8).", 78 | ) 79 | 80 | parser.add_argument( 81 | "-d", 82 | "--use-docintel", 83 | action="store_true", 84 | help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.", 85 | ) 86 | 87 | parser.add_argument( 88 | "-e", 89 | "--endpoint", 90 | type=str, 91 | help="Document Intelligence Endpoint. Required if using Document Intelligence.", 92 | ) 93 | 94 | parser.add_argument( 95 | "-p", 96 | "--use-plugins", 97 | action="store_true", 98 | help="Use 3rd-party plugins to convert files. Use --list-plugins to see installed plugins.", 99 | ) 100 | 101 | parser.add_argument( 102 | "--list-plugins", 103 | action="store_true", 104 | help="List installed 3rd-party plugins. 
Plugins are loaded when using the -p or --use-plugin option.", 105 | ) 106 | 107 | parser.add_argument( 108 | "--keep-data-uris", 109 | action="store_true", 110 | help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.", 111 | ) 112 | 113 | parser.add_argument("filename", nargs="?") 114 | args = parser.parse_args() 115 | 116 | # Parse the extension hint 117 | extension_hint = args.extension 118 | if extension_hint is not None: 119 | extension_hint = extension_hint.strip().lower() 120 | if len(extension_hint) > 0: 121 | if not extension_hint.startswith("."): 122 | extension_hint = "." + extension_hint 123 | else: 124 | extension_hint = None 125 | 126 | # Parse the mime type 127 | mime_type_hint = args.mime_type 128 | if mime_type_hint is not None: 129 | mime_type_hint = mime_type_hint.strip() 130 | if len(mime_type_hint) > 0: 131 | if mime_type_hint.count("/") != 1: 132 | _exit_with_error(f"Invalid MIME type: {mime_type_hint}") 133 | else: 134 | mime_type_hint = None 135 | 136 | # Parse the charset 137 | charset_hint = args.charset 138 | if charset_hint is not None: 139 | charset_hint = charset_hint.strip() 140 | if len(charset_hint) > 0: 141 | try: 142 | charset_hint = codecs.lookup(charset_hint).name 143 | except LookupError: 144 | _exit_with_error(f"Invalid charset: {charset_hint}") 145 | else: 146 | charset_hint = None 147 | 148 | stream_info = None 149 | if ( 150 | extension_hint is not None 151 | or mime_type_hint is not None 152 | or charset_hint is not None 153 | ): 154 | stream_info = StreamInfo( 155 | extension=extension_hint, mimetype=mime_type_hint, charset=charset_hint 156 | ) 157 | 158 | if args.list_plugins: 159 | # List installed plugins, then exit 160 | print("Installed MarkItDown 3rd-party Plugins:\n") 161 | plugin_entry_points = list(entry_points(group="markitdown.plugin")) 162 | if len(plugin_entry_points) == 0: 163 | print(" * No 3rd-party plugins installed.") 164 | print( 165 | "\nFind plugins by 
searching for the hashtag #markitdown-plugin on GitHub.\n" 166 | ) 167 | else: 168 | for entry_point in plugin_entry_points: 169 | print(f" * {entry_point.name:<16}\t(package: {entry_point.value})") 170 | print( 171 | "\nUse the -p (or --use-plugins) option to enable 3rd-party plugins.\n" 172 | ) 173 | sys.exit(0) 174 | 175 | if args.use_docintel: 176 | if args.endpoint is None: 177 | _exit_with_error( 178 | "Document Intelligence Endpoint is required when using Document Intelligence." 179 | ) 180 | elif args.filename is None: 181 | _exit_with_error("Filename is required when using Document Intelligence.") 182 | 183 | markitdown = MarkItDown( 184 | enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint 185 | ) 186 | else: 187 | markitdown = MarkItDown(enable_plugins=args.use_plugins) 188 | 189 | if args.filename is None: 190 | result = markitdown.convert_stream( 191 | sys.stdin.buffer, 192 | stream_info=stream_info, 193 | keep_data_uris=args.keep_data_uris, 194 | ) 195 | else: 196 | result = markitdown.convert( 197 | args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris 198 | ) 199 | 200 | _handle_output(args, result) 201 | 202 | 203 | def _handle_output(args, result: DocumentConverterResult): 204 | """Handle output to stdout or file""" 205 | if args.output: 206 | with open(args.output, "w", encoding="utf-8") as f: 207 | f.write(result.markdown) 208 | else: 209 | # Handle stdout encoding errors more gracefully 210 | print( 211 | result.markdown.encode(sys.stdout.encoding, errors="replace").decode( 212 | sys.stdout.encoding 213 | ) 214 | ) 215 | 216 | 217 | def _exit_with_error(message: str): 218 | print(message) 219 | sys.exit(1) 220 | 221 | 222 | if __name__ == "__main__": 223 | main() 224 | -------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/_base_converter.py: -------------------------------------------------------------------------------- 1 | from typing import Any, 
BinaryIO, Optional 2 | from ._stream_info import StreamInfo 3 | 4 | 5 | class DocumentConverterResult: 6 | """The result of converting a document to Markdown.""" 7 | 8 | def __init__( 9 | self, 10 | markdown: str, 11 | *, 12 | title: Optional[str] = None, 13 | ): 14 | """ 15 | Initialize the DocumentConverterResult. 16 | 17 | The only required parameter is the converted Markdown text. 18 | The title, and any other metadata that may be added in the future, are optional. 19 | 20 | Parameters: 21 | - markdown: The converted Markdown text. 22 | - title: Optional title of the document. 23 | """ 24 | self.markdown = markdown 25 | self.title = title 26 | 27 | @property 28 | def text_content(self) -> str: 29 | """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__.""" 30 | return self.markdown 31 | 32 | @text_content.setter 33 | def text_content(self, markdown: str): 34 | """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__.""" 35 | self.markdown = markdown 36 | 37 | def __str__(self) -> str: 38 | """Return the converted Markdown text.""" 39 | return self.markdown 40 | 41 | 42 | class DocumentConverter: 43 | """Abstract superclass of all DocumentConverters.""" 44 | 45 | def accepts( 46 | self, 47 | file_stream: BinaryIO, 48 | stream_info: StreamInfo, 49 | **kwargs: Any, # Options to pass to the converter 50 | ) -> bool: 51 | """ 52 | Return a quick determination on if the converter should attempt converting the document. 53 | This is primarily based `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`). 54 | In cases where the data is retrieved via HTTP, the `steam_info.url` might also be referenced to 55 | make a determination (e.g., special converters for Wikipedia, YouTube etc). 
Finally, it is conceivable that the `stream_info.filename` might be used in cases
        where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc.)

        NOTE: The method signature is designed to match that of the convert() method. This provides some
        assurance that, if accepts() returns True, the convert() method will also be able to handle the document.

        IMPORTANT: In rare cases (e.g., OutlookMsgConverter) we need to read more from the stream to make a final
        determination. Read operations inevitably advance the position in file_stream. In these cases, the position
        MUST be reset before returning. This is because the convert() method may be called immediately
        after accepts(), and will expect the file_stream to be at the original position.
- stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.)
29 | """ 30 | 31 | pass 32 | 33 | 34 | class UnsupportedFormatException(MarkItDownException): 35 | """ 36 | Thrown when no suitable converter was found for the given file. 37 | """ 38 | 39 | pass 40 | 41 | 42 | class FailedConversionAttempt(object): 43 | """ 44 | Represents an a single attempt to convert a file. 45 | """ 46 | 47 | def __init__(self, converter: Any, exc_info: Optional[tuple] = None): 48 | self.converter = converter 49 | self.exc_info = exc_info 50 | 51 | 52 | class FileConversionException(MarkItDownException): 53 | """ 54 | Thrown when a suitable converter was found, but the conversion 55 | process fails for any reason. 56 | """ 57 | 58 | def __init__( 59 | self, 60 | message: Optional[str] = None, 61 | attempts: Optional[List[FailedConversionAttempt]] = None, 62 | ): 63 | self.attempts = attempts 64 | 65 | if message is None: 66 | if attempts is None: 67 | message = "File conversion failed." 68 | else: 69 | message = f"File conversion failed after {len(attempts)} attempts:\n" 70 | for attempt in attempts: 71 | if attempt.exc_info is None: 72 | message += f" - {type(attempt.converter).__name__} provided no execution info." 73 | else: 74 | message += f" - {type(attempt.converter).__name__} threw {attempt.exc_info[0].__name__} with message: {attempt.exc_info[1]}\n" 75 | 76 | super().__init__(message) 77 | -------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/_stream_info.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, asdict 2 | from typing import Optional 3 | 4 | 5 | @dataclass(kw_only=True, frozen=True) 6 | class StreamInfo: 7 | """The StreamInfo class is used to store information about a file stream. 8 | All fields can be None, and will depend on how the stream was opened. 
9 | """ 10 | 11 | mimetype: Optional[str] = None 12 | extension: Optional[str] = None 13 | charset: Optional[str] = None 14 | filename: Optional[ 15 | str 16 | ] = None # From local path, url, or Content-Disposition header 17 | local_path: Optional[str] = None # If read from disk 18 | url: Optional[str] = None # If read from url 19 | 20 | def copy_and_update(self, *args, **kwargs): 21 | """Copy the StreamInfo object and update it with the given StreamInfo 22 | instance and/or other keyword arguments.""" 23 | new_info = asdict(self) 24 | 25 | for si in args: 26 | assert isinstance(si, StreamInfo) 27 | new_info.update({k: v for k, v in asdict(si).items() if v is not None}) 28 | 29 | if len(kwargs) > 0: 30 | new_info.update(kwargs) 31 | 32 | return StreamInfo(**new_info) 33 | -------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/_uri_utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import os 3 | from typing import Tuple, Dict 4 | from urllib.request import url2pathname 5 | from urllib.parse import urlparse, unquote_to_bytes 6 | 7 | 8 | def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]: 9 | """Convert a file URI to a local file path""" 10 | parsed = urlparse(file_uri) 11 | if parsed.scheme != "file": 12 | raise ValueError(f"Not a file URL: {file_uri}") 13 | 14 | netloc = parsed.netloc if parsed.netloc else None 15 | path = os.path.abspath(url2pathname(parsed.path)) 16 | return netloc, path 17 | 18 | 19 | def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]: 20 | if not uri.startswith("data:"): 21 | raise ValueError("Not a data URI") 22 | 23 | header, _, data = uri.partition(",") 24 | if not _: 25 | raise ValueError("Malformed data URI, missing ',' separator") 26 | 27 | meta = header[5:] # Strip 'data:' 28 | parts = meta.split(";") 29 | 30 | is_base64 = False 31 | # Ends with base64? 
def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]:
    """Split a data: URI into its (mime_type, attributes, content) parts.

    The mime type is None when the URI does not declare one; attributes are
    the ';'-separated key=value pairs from the header; content is the decoded
    payload bytes (base64 or percent-encoded).
    """
    if not uri.startswith("data:"):
        raise ValueError("Not a data URI")

    header, sep, payload = uri.partition(",")
    if sep == "":
        raise ValueError("Malformed data URI, missing ',' separator")

    fields = header[len("data:"):].split(";")

    # A trailing "base64" field flags the payload encoding
    is_base64 = fields[-1] == "base64"
    if is_base64:
        fields = fields[:-1]

    mime_type = None  # Normally this would default to text/plain but we won't assume
    if fields and fields[0]:
        # A non-empty leading field is the mime type
        mime_type, fields = fields[0], fields[1:]

    attributes: Dict[str, str] = {}
    for field in fields:
        key, eq, value = field.partition("=")
        if eq:
            attributes[key] = value
        elif field:
            # Bare flags become keys with empty values
            attributes[field] = ""

    content = base64.b64decode(payload) if is_base64 else unquote_to_bytes(payload)

    return mime_type, attributes, content
# Characters that carry special meaning in LaTeX source.
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")

BLANK = ""
BACKSLASH = "\\"
ALN = "&"

# Map of Unicode combining characters -> LaTeX accent templates.
# Each value is a str.format template; {0} receives the base text.
CHR = {
    # Unicode : Latex Math Symbols
    # Top accents
    "\u0300": "\\grave{{{0}}}",
    "\u0301": "\\acute{{{0}}}",
    "\u0302": "\\hat{{{0}}}",
    "\u0303": "\\tilde{{{0}}}",
    "\u0304": "\\bar{{{0}}}",
    "\u0305": "\\overbar{{{0}}}",
    "\u0306": "\\breve{{{0}}}",
    "\u0307": "\\dot{{{0}}}",
    "\u0308": "\\ddot{{{0}}}",
    "\u0309": "\\ovhook{{{0}}}",
    "\u030a": "\\ocirc{{{0}}}",  # BUGFIX: had an extra '}' yielding unbalanced braces
    "\u030c": "\\check{{{0}}}",  # BUGFIX: had an extra '}' yielding unbalanced braces
    "\u0310": "\\candra{{{0}}}",
    "\u0312": "\\oturnedcomma{{{0}}}",
    "\u0315": "\\ocommatopright{{{0}}}",
    "\u031a": "\\droang{{{0}}}",
    "\u0338": "\\not{{{0}}}",
    "\u20d0": "\\leftharpoonaccent{{{0}}}",
    "\u20d1": "\\rightharpoonaccent{{{0}}}",
    "\u20d2": "\\vertoverlay{{{0}}}",
    "\u20d6": "\\overleftarrow{{{0}}}",
    "\u20d7": "\\vec{{{0}}}",
    "\u20db": "\\dddot{{{0}}}",
    "\u20dc": "\\ddddot{{{0}}}",
    "\u20e1": "\\overleftrightarrow{{{0}}}",
    "\u20e7": "\\annuity{{{0}}}",
    "\u20e9": "\\widebridgeabove{{{0}}}",
    "\u20f0": "\\asteraccent{{{0}}}",
    # Bottom accents
    "\u0330": "\\wideutilde{{{0}}}",
    "\u0331": "\\underbar{{{0}}}",
    "\u20e8": "\\threeunderdot{{{0}}}",
    "\u20ec": "\\underrightharpoondown{{{0}}}",
    "\u20ed": "\\underleftharpoondown{{{0}}}",
    "\u20ee": "\\underleftarrow{{{0}}}",  # BUGFIX: was misspelled "\\underledtarrow"
    "\u20ef": "\\underrightarrow{{{0}}}",
    # Over | group
    "\u23b4": "\\overbracket{{{0}}}",
    "\u23dc": "\\overparen{{{0}}}",
    "\u23de": "\\overbrace{{{0}}}",
    # Under| group
    "\u23b5": "\\underbracket{{{0}}}",
    "\u23dd": "\\underparen{{{0}}}",
    "\u23df": "\\underbrace{{{0}}}",
}
"\u23dd": "\\underparen{{{0}}}", 62 | "\u23df": "\\underbrace{{{0}}}", 63 | } 64 | 65 | CHR_BO = { 66 | # Big operators, 67 | "\u2140": "\\Bbbsum", 68 | "\u220f": "\\prod", 69 | "\u2210": "\\coprod", 70 | "\u2211": "\\sum", 71 | "\u222b": "\\int", 72 | "\u22c0": "\\bigwedge", 73 | "\u22c1": "\\bigvee", 74 | "\u22c2": "\\bigcap", 75 | "\u22c3": "\\bigcup", 76 | "\u2a00": "\\bigodot", 77 | "\u2a01": "\\bigoplus", 78 | "\u2a02": "\\bigotimes", 79 | } 80 | 81 | T = { 82 | "\u2192": "\\rightarrow ", 83 | # Greek letters 84 | "\U0001d6fc": "\\alpha ", 85 | "\U0001d6fd": "\\beta ", 86 | "\U0001d6fe": "\\gamma ", 87 | "\U0001d6ff": "\\theta ", 88 | "\U0001d700": "\\epsilon ", 89 | "\U0001d701": "\\zeta ", 90 | "\U0001d702": "\\eta ", 91 | "\U0001d703": "\\theta ", 92 | "\U0001d704": "\\iota ", 93 | "\U0001d705": "\\kappa ", 94 | "\U0001d706": "\\lambda ", 95 | "\U0001d707": "\\m ", 96 | "\U0001d708": "\\n ", 97 | "\U0001d709": "\\xi ", 98 | "\U0001d70a": "\\omicron ", 99 | "\U0001d70b": "\\pi ", 100 | "\U0001d70c": "\\rho ", 101 | "\U0001d70d": "\\varsigma ", 102 | "\U0001d70e": "\\sigma ", 103 | "\U0001d70f": "\\ta ", 104 | "\U0001d710": "\\upsilon ", 105 | "\U0001d711": "\\phi ", 106 | "\U0001d712": "\\chi ", 107 | "\U0001d713": "\\psi ", 108 | "\U0001d714": "\\omega ", 109 | "\U0001d715": "\\partial ", 110 | "\U0001d716": "\\varepsilon ", 111 | "\U0001d717": "\\vartheta ", 112 | "\U0001d718": "\\varkappa ", 113 | "\U0001d719": "\\varphi ", 114 | "\U0001d71a": "\\varrho ", 115 | "\U0001d71b": "\\varpi ", 116 | # Relation symbols 117 | "\u2190": "\\leftarrow ", 118 | "\u2191": "\\uparrow ", 119 | "\u2192": "\\rightarrow ", 120 | "\u2193": "\\downright ", 121 | "\u2194": "\\leftrightarrow ", 122 | "\u2195": "\\updownarrow ", 123 | "\u2196": "\\nwarrow ", 124 | "\u2197": "\\nearrow ", 125 | "\u2198": "\\searrow ", 126 | "\u2199": "\\swarrow ", 127 | "\u22ee": "\\vdots ", 128 | "\u22ef": "\\cdots ", 129 | "\u22f0": "\\adots ", 130 | "\u22f1": "\\ddots ", 131 | "\u2260": 
"\\ne ", 132 | "\u2264": "\\leq ", 133 | "\u2265": "\\geq ", 134 | "\u2266": "\\leqq ", 135 | "\u2267": "\\geqq ", 136 | "\u2268": "\\lneqq ", 137 | "\u2269": "\\gneqq ", 138 | "\u226a": "\\ll ", 139 | "\u226b": "\\gg ", 140 | "\u2208": "\\in ", 141 | "\u2209": "\\notin ", 142 | "\u220b": "\\ni ", 143 | "\u220c": "\\nni ", 144 | # Ordinary symbols 145 | "\u221e": "\\infty ", 146 | # Binary relations 147 | "\u00b1": "\\pm ", 148 | "\u2213": "\\mp ", 149 | # Italic, Latin, uppercase 150 | "\U0001d434": "A", 151 | "\U0001d435": "B", 152 | "\U0001d436": "C", 153 | "\U0001d437": "D", 154 | "\U0001d438": "E", 155 | "\U0001d439": "F", 156 | "\U0001d43a": "G", 157 | "\U0001d43b": "H", 158 | "\U0001d43c": "I", 159 | "\U0001d43d": "J", 160 | "\U0001d43e": "K", 161 | "\U0001d43f": "L", 162 | "\U0001d440": "M", 163 | "\U0001d441": "N", 164 | "\U0001d442": "O", 165 | "\U0001d443": "P", 166 | "\U0001d444": "Q", 167 | "\U0001d445": "R", 168 | "\U0001d446": "S", 169 | "\U0001d447": "T", 170 | "\U0001d448": "U", 171 | "\U0001d449": "V", 172 | "\U0001d44a": "W", 173 | "\U0001d44b": "X", 174 | "\U0001d44c": "Y", 175 | "\U0001d44d": "Z", 176 | # Italic, Latin, lowercase 177 | "\U0001d44e": "a", 178 | "\U0001d44f": "b", 179 | "\U0001d450": "c", 180 | "\U0001d451": "d", 181 | "\U0001d452": "e", 182 | "\U0001d453": "f", 183 | "\U0001d454": "g", 184 | "\U0001d456": "i", 185 | "\U0001d457": "j", 186 | "\U0001d458": "k", 187 | "\U0001d459": "l", 188 | "\U0001d45a": "m", 189 | "\U0001d45b": "n", 190 | "\U0001d45c": "o", 191 | "\U0001d45d": "p", 192 | "\U0001d45e": "q", 193 | "\U0001d45f": "r", 194 | "\U0001d460": "s", 195 | "\U0001d461": "t", 196 | "\U0001d462": "u", 197 | "\U0001d463": "v", 198 | "\U0001d464": "w", 199 | "\U0001d465": "x", 200 | "\U0001d466": "y", 201 | "\U0001d467": "z", 202 | } 203 | 204 | FUNC = { 205 | "sin": "\\sin({fe})", 206 | "cos": "\\cos({fe})", 207 | "tan": "\\tan({fe})", 208 | "arcsin": "\\arcsin({fe})", 209 | "arccos": "\\arccos({fe})", 210 | "arctan": 
"\\arctan({fe})", 211 | "arccot": "\\arccot({fe})", 212 | "sinh": "\\sinh({fe})", 213 | "cosh": "\\cosh({fe})", 214 | "tanh": "\\tanh({fe})", 215 | "coth": "\\coth({fe})", 216 | "sec": "\\sec({fe})", 217 | "csc": "\\csc({fe})", 218 | } 219 | 220 | FUNC_PLACE = "{fe}" 221 | 222 | BRK = "\\\\" 223 | 224 | CHR_DEFAULT = { 225 | "ACC_VAL": "\\hat{{{0}}}", 226 | } 227 | 228 | POS = { 229 | "top": "\\overline{{{0}}}", # not sure 230 | "bot": "\\underline{{{0}}}", 231 | } 232 | 233 | POS_DEFAULT = { 234 | "BAR_VAL": "\\overline{{{0}}}", 235 | } 236 | 237 | SUB = "_{{{0}}}" 238 | 239 | SUP = "^{{{0}}}" 240 | 241 | F = { 242 | "bar": "\\frac{{{num}}}{{{den}}}", 243 | "skw": r"^{{{num}}}/_{{{den}}}", 244 | "noBar": "\\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}", 245 | "lin": "{{{num}}}/{{{den}}}", 246 | } 247 | F_DEFAULT = "\\frac{{{num}}}{{{den}}}" 248 | 249 | D = "\\left{left}{text}\\right{right}" 250 | 251 | D_DEFAULT = { 252 | "left": "(", 253 | "right": ")", 254 | "null": ".", 255 | } 256 | 257 | RAD = "\\sqrt[{deg}]{{{text}}}" 258 | 259 | RAD_DEFAULT = "\\sqrt{{{text}}}" 260 | 261 | ARR = "\\begin{{array}}{{c}}{text}\\end{{array}}" 262 | 263 | LIM_FUNC = { 264 | "lim": "\\lim_{{{lim}}}", 265 | "max": "\\max_{{{lim}}}", 266 | "min": "\\min_{{{lim}}}", 267 | } 268 | 269 | LIM_TO = ("\\rightarrow", "\\to") 270 | 271 | LIM_UPP = "\\overset{{{lim}}}{{{text}}}" 272 | 273 | M = "\\begin{{matrix}}{text}\\end{{matrix}}" 274 | -------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | from io import BytesIO 3 | from typing import BinaryIO 4 | from xml.etree import ElementTree as ET 5 | 6 | from bs4 import BeautifulSoup, Tag 7 | 8 | from .math.omml import OMML_NS, oMath2Latex 9 | 10 | MATH_ROOT_TEMPLATE = "".join( 11 | ( 12 | "', 28 | "{0}", 29 | ) 30 | ) 31 | 32 | 33 | def 
def _convert_omath_to_latex(tag: Tag) -> str:
    """
    Converts an OMML (Office Math Markup Language) tag to LaTeX format.

    Args:
        tag (Tag): A BeautifulSoup Tag object representing the OMML element.

    Returns:
        str: The LaTeX representation of the OMML element.
    """
    # Wrap the fragment in a namespaced root so it parses as a document
    document = MATH_ROOT_TEMPLATE.format(str(tag))
    root = ET.fromstring(document)
    # Locate the 'oMath' element and hand it to the OMML->LaTeX converter
    omath_element = root.find(OMML_NS + "oMath")
    return oMath2Latex(omath_element).latex


def _get_omath_tag_replacement(tag: Tag, block: bool = False) -> Tag:
    """
    Creates a replacement tag for an OMML (Office Math Markup Language) element.

    Args:
        tag (Tag): A BeautifulSoup Tag object representing the "oMath" element.
        block (bool, optional): If True, the LaTeX will be wrapped in double dollar signs for block mode. Defaults to False.

    Returns:
        Tag: A BeautifulSoup Tag object representing the replacement element.
    """
    latex = _convert_omath_to_latex(tag)
    # Block equations use $$...$$; inline equations use $...$
    text_tag = Tag(name="w:t")
    text_tag.string = f"$${latex}$$" if block else f"${latex}$"
    run_tag = Tag(name="w:r")
    run_tag.append(text_tag)
    return run_tag
83 | """ 84 | if tag.name == "oMathPara": 85 | # Create a new paragraph tag 86 | p_tag = Tag(name="w:p") 87 | # Replace each 'oMath' child tag with its LaTeX equivalent as block equations 88 | for child_tag in tag.find_all("oMath"): 89 | p_tag.append(_get_omath_tag_replacement(child_tag, block=True)) 90 | # Replace the original 'oMathPara' tag with the new paragraph tag 91 | tag.replace_with(p_tag) 92 | elif tag.name == "oMath": 93 | # Replace the 'oMath' tag with its LaTeX equivalent as inline equation 94 | tag.replace_with(_get_omath_tag_replacement(tag, block=False)) 95 | else: 96 | raise ValueError(f"Not supported tag: {tag.name}") 97 | 98 | 99 | def _pre_process_math(content: bytes) -> bytes: 100 | """ 101 | Pre-processes the math content in a DOCX -> XML file by converting OMML (Office Math Markup Language) elements to LaTeX. 102 | This preprocessed content can be directly replaced in the DOCX file -> XMLs. 103 | 104 | Args: 105 | content (bytes): The XML content of the DOCX file as bytes. 106 | 107 | Returns: 108 | bytes: The processed content with OMML elements replaced by their LaTeX equivalents, encoded as bytes. 109 | """ 110 | soup = BeautifulSoup(content.decode(), features="xml") 111 | for tag in soup.find_all("oMathPara"): 112 | _replace_equations(tag) 113 | for tag in soup.find_all("oMath"): 114 | _replace_equations(tag) 115 | return str(soup).encode() 116 | 117 | 118 | def pre_process_docx(input_docx: BinaryIO) -> BinaryIO: 119 | """ 120 | Pre-processes a DOCX file with provided steps. 121 | 122 | The process works by unzipping the DOCX file in memory, transforming specific XML files 123 | (such as converting OMML elements to LaTeX), and then zipping everything back into a 124 | DOCX file without writing to disk. 125 | 126 | Args: 127 | input_docx (BinaryIO): A binary input stream representing the DOCX file. 128 | 129 | Returns: 130 | BinaryIO: A binary output stream representing the processed DOCX file. 
131 | """ 132 | output_docx = BytesIO() 133 | # The files that need to be pre-processed from .docx 134 | pre_process_enable_files = [ 135 | "word/document.xml", 136 | "word/footnotes.xml", 137 | "word/endnotes.xml", 138 | ] 139 | with zipfile.ZipFile(input_docx, mode="r") as zip_input: 140 | files = {name: zip_input.read(name) for name in zip_input.namelist()} 141 | with zipfile.ZipFile(output_docx, mode="w") as zip_output: 142 | zip_output.comment = zip_input.comment 143 | for name, content in files.items(): 144 | if name in pre_process_enable_files: 145 | try: 146 | # Pre-process the content 147 | updated_content = _pre_process_math(content) 148 | # In the future, if there are more pre-processing steps, they can be added here 149 | zip_output.writestr(name, updated_content) 150 | except Exception: 151 | # If there is an error in processing the content, write the original content 152 | zip_output.writestr(name, content) 153 | else: 154 | zip_output.writestr(name, content) 155 | output_docx.seek(0) 156 | return output_docx 157 | -------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/converters/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney 2 | # 3 | # SPDX-License-Identifier: MIT 4 | 5 | from ._plain_text_converter import PlainTextConverter 6 | from ._html_converter import HtmlConverter 7 | from ._rss_converter import RssConverter 8 | from ._wikipedia_converter import WikipediaConverter 9 | from ._youtube_converter import YouTubeConverter 10 | from ._ipynb_converter import IpynbConverter 11 | from ._bing_serp_converter import BingSerpConverter 12 | from ._pdf_converter import PdfConverter 13 | from ._docx_converter import DocxConverter 14 | from ._xlsx_converter import XlsxConverter, XlsConverter 15 | from ._pptx_converter import PptxConverter 16 | from ._image_converter import ImageConverter 17 | from 
ACCEPTED_MIME_TYPE_PREFIXES = [
    "audio/x-wav",
    "audio/mpeg",
    "video/mp4",
]

ACCEPTED_FILE_EXTENSIONS = [
    ".wav",
    ".mp3",
    ".m4a",
    ".mp4",
]


class AudioConverter(DocumentConverter):
    """
    Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Accept the stream when its (case-insensitive) extension or mimetype
        matches a known audio/video type."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Extract audio metadata via exiftool (if available) and, best effort,
        a speech transcript; both are rendered as markdown."""
        md_content = ""

        # Add metadata
        metadata = exiftool_metadata(
            file_stream, exiftool_path=kwargs.get("exiftool_path")
        )
        if metadata:
            for f in [
                "Title",
                "Artist",
                "Author",
                "Band",
                "Album",
                "Genre",
                "Track",
                "DateTimeOriginal",
                "CreateDate",
                # "Duration", -- Wrong values when read from memory
                "NumChannels",
                "SampleRate",
                "AvgBytesPerSec",
                "BitsPerSample",
            ]:
                if f in metadata:
                    md_content += f"{f}: {metadata[f]}\n"

        # Figure out the audio format for transcription.
        # BUGFIX: normalize case and None exactly as accepts() does; previously
        # a file accepted via an upper-case extension (e.g. ".WAV") or mimetype
        # failed to match here and was silently skipped for transcription.
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
        if extension == ".wav" or mimetype == "audio/x-wav":
            audio_format = "wav"
        elif extension == ".mp3" or mimetype == "audio/mpeg":
            audio_format = "mp3"
        elif extension in [".mp4", ".m4a"] or mimetype == "video/mp4":
            audio_format = "mp4"
        else:
            audio_format = None

        # Transcribe (transcription dependencies are optional; skip on absence)
        if audio_format:
            try:
                transcript = transcribe_audio(file_stream, audio_format=audio_format)
                if transcript:
                    md_content += "\n\n### Audio Transcript:\n" + transcript
            except MissingDependencyException:
                pass

        # Return the result
        return DocumentConverterResult(markdown=md_content.strip())
# HTML content types this converter can parse
ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/html",
    "application/xhtml",
]

ACCEPTED_FILE_EXTENSIONS = [
    ".html",
    ".htm",
]


class BingSerpConverter(DocumentConverter):
    """
    Handle Bing results pages (only the organic search results).
    NOTE: It is better to use the Bing API
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Make sure we're dealing with HTML content *from* Bing.

        Requires both a Bing search URL and an HTML extension/mimetype.
        """

        url = stream_info.url or ""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if not re.search(r"^https://www\.bing\.com/search\?q=", url):
            # Not a Bing SERP URL
            return False

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        # Not HTML content
        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert a Bing SERP to markdown: extract the query from the URL,
        clean up the soup, rewrite Bing's redirect links to their real
        destinations, and markdownify each organic result.
        """
        assert stream_info.url is not None

        # Parse the query parameters (the search terms live in "q")
        parsed_params = parse_qs(urlparse(stream_info.url).query)
        query = parsed_params.get("q", [""])[0]

        # Parse the stream
        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

        # Clean up some formatting
        for tptt in soup.find_all(class_="tptt"):
            if hasattr(tptt, "string") and tptt.string:
                # Keep title fragments from running together once markdownified
                tptt.string += " "
        for slug in soup.find_all(class_="algoSlug_icon"):
            slug.extract()

        # Parse the algorithmic results
        _markdownify = _CustomMarkdownify(**kwargs)
        results = list()
        for result in soup.find_all(class_="b_algo"):
            if not hasattr(result, "find_all"):
                continue

            # Rewrite redirect urls
            for a in result.find_all("a", href=True):
                parsed_href = urlparse(a["href"])
                qs = parse_qs(parsed_href.query)

                # The destination is contained in the u parameter,
                # but appears to be base64 encoded, with some prefix
                if "u" in qs:
                    u = (
                        qs["u"][0][2:].strip() + "=="
                    )  # Python 3 doesn't care about extra padding

                    try:
                        # RFC 4648 / Base64URL" variant, which uses "-" and "_"
                        a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8")
                    except UnicodeDecodeError:
                        pass
                    except binascii.Error:
                        pass

            # Convert to markdown, dropping blank lines inside each result
            md_result = _markdownify.convert_soup(result).strip()
            lines = [line.strip() for line in re.split(r"\n+", md_result)]
            results.append("\n".join([line for line in lines if len(line) > 0]))

        webpage_text = (
            f"## A Bing search for '{query}' found the following results:\n\n"
            + "\n\n".join(results)
        )

        return DocumentConverterResult(
            markdown=webpage_text,
            title=None if soup.title is None else soup.title.string,
        )
18 | """ 19 | 20 | def __init__(self): 21 | super().__init__() 22 | 23 | def accepts( 24 | self, 25 | file_stream: BinaryIO, 26 | stream_info: StreamInfo, 27 | **kwargs: Any, # Options to pass to the converter 28 | ) -> bool: 29 | mimetype = (stream_info.mimetype or "").lower() 30 | extension = (stream_info.extension or "").lower() 31 | if extension in ACCEPTED_FILE_EXTENSIONS: 32 | return True 33 | for prefix in ACCEPTED_MIME_TYPE_PREFIXES: 34 | if mimetype.startswith(prefix): 35 | return True 36 | return False 37 | 38 | def convert( 39 | self, 40 | file_stream: BinaryIO, 41 | stream_info: StreamInfo, 42 | **kwargs: Any, # Options to pass to the converter 43 | ) -> DocumentConverterResult: 44 | # Read the file content 45 | if stream_info.charset: 46 | content = file_stream.read().decode(stream_info.charset) 47 | else: 48 | content = str(from_bytes(file_stream.read()).best()) 49 | 50 | # Parse CSV content 51 | reader = csv.reader(io.StringIO(content)) 52 | rows = list(reader) 53 | 54 | if not rows: 55 | return DocumentConverterResult(markdown="") 56 | 57 | # Create markdown table 58 | markdown_table = [] 59 | 60 | # Add header row 61 | markdown_table.append("| " + " | ".join(rows[0]) + " |") 62 | 63 | # Add separator row 64 | markdown_table.append("| " + " | ".join(["---"] * len(rows[0])) + " |") 65 | 66 | # Add data rows 67 | for row in rows[1:]: 68 | # Make sure row has the same number of columns as header 69 | while len(row) < len(rows[0]): 70 | row.append("") 71 | # Truncate if row has more columns than header 72 | row = row[: len(rows[0])] 73 | markdown_table.append("| " + " | ".join(row) + " |") 74 | 75 | result = "\n".join(markdown_table) 76 | 77 | return DocumentConverterResult(markdown=result) 78 | -------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/converters/_doc_intel_converter.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
    from azure.ai.documentintelligence import DocumentIntelligenceClient
    from azure.ai.documentintelligence.models import (
        AnalyzeDocumentRequest,
        AnalyzeResult,
        DocumentAnalysisFeature,
    )
    from azure.core.credentials import AzureKeyCredential, TokenCredential
    from azure.identity import DefaultAzureCredential
except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()

    # Define these types for type hinting when the package is not available.
    # NOTE(review): these stubs are placed inside the except branch so the real
    # azure classes are not shadowed when the import succeeds — confirm against
    # the upstream file, since the source dump's indentation is ambiguous here.
    class AzureKeyCredential:
        pass

    class TokenCredential:
        pass

    class DocumentIntelligenceClient:
        pass

    class AnalyzeDocumentRequest:
        pass

    class AnalyzeResult:
        pass

    class DocumentAnalysisFeature:
        pass

    class DefaultAzureCredential:
        pass
CONTENT_FORMAT = "markdown"


class DocumentIntelligenceFileType(str, Enum):
    """Enum of file types supported by the Document Intelligence Converter."""

    # No OCR
    DOCX = "docx"
    PPTX = "pptx"
    XLSX = "xlsx"
    HTML = "html"
    # OCR
    PDF = "pdf"
    JPEG = "jpeg"
    PNG = "png"
    BMP = "bmp"
    TIFF = "tiff"


# Lookup tables used by the helpers below. Using data instead of parallel
# if/elif chains keeps the two helpers in sync and adds the HTML entries the
# original chains were missing (HTML is referenced by _analysis_features, so
# without these entries HTML streams were never excluded from OCR features).
_MIME_TYPE_PREFIXES = {
    DocumentIntelligenceFileType.DOCX: [
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ],
    # Deliberately a prefix (no ".presentation" suffix) so related subtypes match.
    DocumentIntelligenceFileType.PPTX: [
        "application/vnd.openxmlformats-officedocument.presentationml"
    ],
    DocumentIntelligenceFileType.XLSX: [
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    ],
    DocumentIntelligenceFileType.HTML: ["text/html"],
    DocumentIntelligenceFileType.PDF: ["application/pdf", "application/x-pdf"],
    DocumentIntelligenceFileType.JPEG: ["image/jpeg"],
    DocumentIntelligenceFileType.PNG: ["image/png"],
    DocumentIntelligenceFileType.BMP: ["image/bmp"],
    DocumentIntelligenceFileType.TIFF: ["image/tiff"],
}

_FILE_EXTENSIONS = {
    DocumentIntelligenceFileType.DOCX: [".docx"],
    DocumentIntelligenceFileType.PPTX: [".pptx"],
    DocumentIntelligenceFileType.XLSX: [".xlsx"],
    DocumentIntelligenceFileType.HTML: [".html"],
    DocumentIntelligenceFileType.PDF: [".pdf"],
    DocumentIntelligenceFileType.JPEG: [".jpg", ".jpeg"],
    DocumentIntelligenceFileType.PNG: [".png"],
    DocumentIntelligenceFileType.BMP: [".bmp"],
    DocumentIntelligenceFileType.TIFF: [".tiff"],
}


def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[str]:
    """Get the MIME type prefixes for the given file types."""
    prefixes: List[str] = []
    for type_ in types:
        prefixes.extend(_MIME_TYPE_PREFIXES.get(type_, []))
    return prefixes


def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]:
    """Get the file extensions for the given file types."""
    extensions: List[str] = []
    for type_ in types:
        extensions.extend(_FILE_EXTENSIONS.get(type_, []))
    return extensions


# Default set of accepted file types. A tuple so it cannot be mutated between
# instances (the original signature used a shared mutable list default).
_DEFAULT_FILE_TYPES = (
    DocumentIntelligenceFileType.DOCX,
    DocumentIntelligenceFileType.PPTX,
    DocumentIntelligenceFileType.XLSX,
    DocumentIntelligenceFileType.PDF,
    DocumentIntelligenceFileType.JPEG,
    DocumentIntelligenceFileType.PNG,
    DocumentIntelligenceFileType.BMP,
    DocumentIntelligenceFileType.TIFF,
)


class DocumentIntelligenceConverter(DocumentConverter):
    """Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""

    def __init__(
        self,
        *,
        endpoint: str,
        api_version: str = "2024-07-31-preview",
        credential: "AzureKeyCredential | TokenCredential | None" = None,
        file_types: "List[DocumentIntelligenceFileType] | None" = None,
    ):
        """
        Initialize the DocumentIntelligenceConverter.

        Args:
            endpoint (str): The endpoint for the Document Intelligence service.
            api_version (str): The API version to use. Defaults to "2024-07-31-preview".
            credential (AzureKeyCredential | TokenCredential | None): The credential to use for
                authentication. When None, the AZURE_API_KEY environment variable is used if
                set, otherwise DefaultAzureCredential.
            file_types (List[DocumentIntelligenceFileType] | None): The file types to accept.
                Defaults to all supported file types. (None replaces the original mutable
                list default; passing the same list explicitly behaves identically.)

        Raises:
            MissingDependencyException: if the optional [az-doc-intel] dependencies
                are not installed.
        """

        super().__init__()
        # Copy the caller's list so later mutation of it cannot affect this instance.
        self._file_types = (
            list(_DEFAULT_FILE_TYPES) if file_types is None else list(file_types)
        )

        # Raise an error if the dependencies are not available.
        # This is different than other converters since this one isn't even instantiated
        # unless explicitly requested.
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                "DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
            ) from _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        if credential is None:
            # Prefer an explicit API key from the environment; otherwise fall back
            # to the ambient Azure identity (managed identity, CLI login, etc.).
            if os.environ.get("AZURE_API_KEY") is None:
                credential = DefaultAzureCredential()
            else:
                credential = AzureKeyCredential(os.environ["AZURE_API_KEY"])

        self.endpoint = endpoint
        self.api_version = api_version
        self.doc_intel_client = DocumentIntelligenceClient(
            endpoint=self.endpoint,
            api_version=self.api_version,
            credential=credential,
        )

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Return True when the stream's extension or MIME type matches an accepted file type."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in _get_file_extensions(self._file_types):
            return True

        return any(
            mimetype.startswith(prefix)
            for prefix in _get_mime_type_prefixes(self._file_types)
        )

    def _analysis_features(self, stream_info: StreamInfo) -> List[str]:
        """
        Helper needed to determine which analysis features to use.
        Certain document analysis features are not available for
        office filetypes (.xlsx, .pptx, .html, .docx)
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        # Types that don't support ocr
        no_ocr_types = [
            DocumentIntelligenceFileType.DOCX,
            DocumentIntelligenceFileType.PPTX,
            DocumentIntelligenceFileType.XLSX,
            DocumentIntelligenceFileType.HTML,
        ]

        if extension in _get_file_extensions(no_ocr_types):
            return []

        for prefix in _get_mime_type_prefixes(no_ocr_types):
            if mimetype.startswith(prefix):
                return []

        return [
            DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
            DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
            DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
        ]

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Extract the document's text as Markdown using Azure Document Intelligence."""
        poller = self.doc_intel_client.begin_analyze_document(
            model_id="prebuilt-layout",
            body=AnalyzeDocumentRequest(bytes_source=file_stream.read()),
            features=self._analysis_features(stream_info),
            output_content_format=CONTENT_FORMAT,  # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
        )
        result: AnalyzeResult = poller.result()

        # Fix: the original pattern was the empty string r"" (matches everywhere,
        # removes nothing, and made flags=re.DOTALL pointless). Strip the
        # HTML-style comments (e.g., "<!-- PageBreak -->") that Document
        # Intelligence embeds in its markdown output, as the comment intended.
        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
        return DocumentConverterResult(markdown=markdown_text)
-------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from typing import BinaryIO, Any 4 | 5 | from ._html_converter import HtmlConverter 6 | from ..converter_utils.docx.pre_process import pre_process_docx 7 | from .._base_converter import DocumentConverterResult 8 | from .._stream_info import StreamInfo 9 | from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE 10 | 11 | # Try loading optional (but in this case, required) dependencies 12 | # Save reporting of any exceptions for later 13 | _dependency_exc_info = None 14 | try: 15 | import mammoth 16 | except ImportError: 17 | # Preserve the error and stack trace for later 18 | _dependency_exc_info = sys.exc_info() 19 | 20 | 21 | ACCEPTED_MIME_TYPE_PREFIXES = [ 22 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 23 | ] 24 | 25 | ACCEPTED_FILE_EXTENSIONS = [".docx"] 26 | 27 | 28 | class DocxConverter(HtmlConverter): 29 | """ 30 | Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. 
31 | """ 32 | 33 | def __init__(self): 34 | super().__init__() 35 | self._html_converter = HtmlConverter() 36 | 37 | def accepts( 38 | self, 39 | file_stream: BinaryIO, 40 | stream_info: StreamInfo, 41 | **kwargs: Any, # Options to pass to the converter 42 | ) -> bool: 43 | mimetype = (stream_info.mimetype or "").lower() 44 | extension = (stream_info.extension or "").lower() 45 | 46 | if extension in ACCEPTED_FILE_EXTENSIONS: 47 | return True 48 | 49 | for prefix in ACCEPTED_MIME_TYPE_PREFIXES: 50 | if mimetype.startswith(prefix): 51 | return True 52 | 53 | return False 54 | 55 | def convert( 56 | self, 57 | file_stream: BinaryIO, 58 | stream_info: StreamInfo, 59 | **kwargs: Any, # Options to pass to the converter 60 | ) -> DocumentConverterResult: 61 | # Check: the dependencies 62 | if _dependency_exc_info is not None: 63 | raise MissingDependencyException( 64 | MISSING_DEPENDENCY_MESSAGE.format( 65 | converter=type(self).__name__, 66 | extension=".docx", 67 | feature="docx", 68 | ) 69 | ) from _dependency_exc_info[ 70 | 1 71 | ].with_traceback( # type: ignore[union-attr] 72 | _dependency_exc_info[2] 73 | ) 74 | 75 | style_map = kwargs.get("style_map", None) 76 | pre_process_stream = pre_process_docx(file_stream) 77 | return self._html_converter.convert_string( 78 | mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, 79 | **kwargs, 80 | ) 81 | -------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/converters/_epub_converter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | from defusedxml import minidom 4 | from xml.dom.minidom import Document 5 | 6 | from typing import BinaryIO, Any, Dict, List 7 | 8 | from ._html_converter import HtmlConverter 9 | from .._base_converter import DocumentConverterResult 10 | from .._stream_info import StreamInfo 11 | 12 | ACCEPTED_MIME_TYPE_PREFIXES = [ 13 | "application/epub", 14 
| "application/epub+zip", 15 | "application/x-epub+zip", 16 | ] 17 | 18 | ACCEPTED_FILE_EXTENSIONS = [".epub"] 19 | 20 | MIME_TYPE_MAPPING = { 21 | ".html": "text/html", 22 | ".xhtml": "application/xhtml+xml", 23 | } 24 | 25 | 26 | class EpubConverter(HtmlConverter): 27 | """ 28 | Converts EPUB files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. 29 | """ 30 | 31 | def __init__(self): 32 | super().__init__() 33 | self._html_converter = HtmlConverter() 34 | 35 | def accepts( 36 | self, 37 | file_stream: BinaryIO, 38 | stream_info: StreamInfo, 39 | **kwargs: Any, # Options to pass to the converter 40 | ) -> bool: 41 | mimetype = (stream_info.mimetype or "").lower() 42 | extension = (stream_info.extension or "").lower() 43 | 44 | if extension in ACCEPTED_FILE_EXTENSIONS: 45 | return True 46 | 47 | for prefix in ACCEPTED_MIME_TYPE_PREFIXES: 48 | if mimetype.startswith(prefix): 49 | return True 50 | 51 | return False 52 | 53 | def convert( 54 | self, 55 | file_stream: BinaryIO, 56 | stream_info: StreamInfo, 57 | **kwargs: Any, # Options to pass to the converter 58 | ) -> DocumentConverterResult: 59 | with zipfile.ZipFile(file_stream, "r") as z: 60 | # Extracts metadata (title, authors, language, publisher, date, description, cover) from an EPUB file.""" 61 | 62 | # Locate content.opf 63 | container_dom = minidom.parse(z.open("META-INF/container.xml")) 64 | opf_path = container_dom.getElementsByTagName("rootfile")[0].getAttribute( 65 | "full-path" 66 | ) 67 | 68 | # Parse content.opf 69 | opf_dom = minidom.parse(z.open(opf_path)) 70 | metadata: Dict[str, Any] = { 71 | "title": self._get_text_from_node(opf_dom, "dc:title"), 72 | "authors": self._get_all_texts_from_nodes(opf_dom, "dc:creator"), 73 | "language": self._get_text_from_node(opf_dom, "dc:language"), 74 | "publisher": self._get_text_from_node(opf_dom, "dc:publisher"), 75 | "date": self._get_text_from_node(opf_dom, "dc:date"), 76 | "description": 
self._get_text_from_node(opf_dom, "dc:description"), 77 | "identifier": self._get_text_from_node(opf_dom, "dc:identifier"), 78 | } 79 | 80 | # Extract manifest items (ID → href mapping) 81 | manifest = { 82 | item.getAttribute("id"): item.getAttribute("href") 83 | for item in opf_dom.getElementsByTagName("item") 84 | } 85 | 86 | # Extract spine order (ID refs) 87 | spine_items = opf_dom.getElementsByTagName("itemref") 88 | spine_order = [item.getAttribute("idref") for item in spine_items] 89 | 90 | # Convert spine order to actual file paths 91 | base_path = "/".join( 92 | opf_path.split("/")[:-1] 93 | ) # Get base directory of content.opf 94 | spine = [ 95 | f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id] 96 | for item_id in spine_order 97 | if item_id in manifest 98 | ] 99 | 100 | # Extract and convert the content 101 | markdown_content: List[str] = [] 102 | for file in spine: 103 | if file in z.namelist(): 104 | with z.open(file) as f: 105 | filename = os.path.basename(file) 106 | extension = os.path.splitext(filename)[1].lower() 107 | mimetype = MIME_TYPE_MAPPING.get(extension) 108 | converted_content = self._html_converter.convert( 109 | f, 110 | StreamInfo( 111 | mimetype=mimetype, 112 | extension=extension, 113 | filename=filename, 114 | ), 115 | ) 116 | markdown_content.append(converted_content.markdown.strip()) 117 | 118 | # Format and add the metadata 119 | metadata_markdown = [] 120 | for key, value in metadata.items(): 121 | if isinstance(value, list): 122 | value = ", ".join(value) 123 | if value: 124 | metadata_markdown.append(f"**{key.capitalize()}:** {value}") 125 | 126 | markdown_content.insert(0, "\n".join(metadata_markdown)) 127 | 128 | return DocumentConverterResult( 129 | markdown="\n\n".join(markdown_content), title=metadata["title"] 130 | ) 131 | 132 | def _get_text_from_node(self, dom: Document, tag_name: str) -> str | None: 133 | """Convenience function to extract a single occurrence of a tag (e.g., title).""" 134 | 
        texts = self._get_all_texts_from_nodes(dom, tag_name)
        if len(texts) > 0:
            return texts[0]
        else:
            return None

    def _get_all_texts_from_nodes(self, dom: Document, tag_name: str) -> List[str]:
        """Helper function to extract all occurrences of a tag (e.g., multiple authors)."""
        texts: List[str] = []
        for node in dom.getElementsByTagName(tag_name):
            # Only nodes with a text child contribute a value
            if node.firstChild and hasattr(node.firstChild, "nodeValue"):
                texts.append(node.firstChild.nodeValue.strip())
        return texts


import json
import subprocess
import locale
from typing import BinaryIO, Any, Union


def exiftool_metadata(
    file_stream: BinaryIO,
    *,
    exiftool_path: Union[str, None],
) -> Any:  # Need a better type for json data
    """Run exiftool over the stream's bytes and return the parsed JSON metadata.

    Returns an empty dict when no exiftool path is configured. The stream
    position is restored before returning.
    """
    # Nothing to do
    if not exiftool_path:
        return {}

    # Run exiftool, feeding the stream's bytes on stdin ("-")
    cur_pos = file_stream.tell()
    try:
        output = subprocess.run(
            [exiftool_path, "-json", "-"],
            input=file_stream.read(),
            capture_output=True,
            text=False,
        ).stdout

        # exiftool emits a JSON array with one object per input file
        return json.loads(
            output.decode(locale.getpreferredencoding(False)),
        )[0]
    finally:
        file_stream.seek(cur_pos)


import io
from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify

ACCEPTED_MIME_TYPE_PREFIXES = [
"text/html", 11 | "application/xhtml", 12 | ] 13 | 14 | ACCEPTED_FILE_EXTENSIONS = [ 15 | ".html", 16 | ".htm", 17 | ] 18 | 19 | 20 | class HtmlConverter(DocumentConverter): 21 | """Anything with content type text/html""" 22 | 23 | def accepts( 24 | self, 25 | file_stream: BinaryIO, 26 | stream_info: StreamInfo, 27 | **kwargs: Any, # Options to pass to the converter 28 | ) -> bool: 29 | mimetype = (stream_info.mimetype or "").lower() 30 | extension = (stream_info.extension or "").lower() 31 | 32 | if extension in ACCEPTED_FILE_EXTENSIONS: 33 | return True 34 | 35 | for prefix in ACCEPTED_MIME_TYPE_PREFIXES: 36 | if mimetype.startswith(prefix): 37 | return True 38 | 39 | return False 40 | 41 | def convert( 42 | self, 43 | file_stream: BinaryIO, 44 | stream_info: StreamInfo, 45 | **kwargs: Any, # Options to pass to the converter 46 | ) -> DocumentConverterResult: 47 | # Parse the stream 48 | encoding = "utf-8" if stream_info.charset is None else stream_info.charset 49 | soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) 50 | 51 | # Remove javascript and style blocks 52 | for script in soup(["script", "style"]): 53 | script.extract() 54 | 55 | # Print only the main content 56 | body_elm = soup.find("body") 57 | webpage_text = "" 58 | if body_elm: 59 | webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm) 60 | else: 61 | webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup) 62 | 63 | assert isinstance(webpage_text, str) 64 | 65 | # remove leading and trailing \n 66 | webpage_text = webpage_text.strip() 67 | 68 | return DocumentConverterResult( 69 | markdown=webpage_text, 70 | title=None if soup.title is None else soup.title.string, 71 | ) 72 | 73 | def convert_string( 74 | self, html_content: str, *, url: Optional[str] = None, **kwargs 75 | ) -> DocumentConverterResult: 76 | """ 77 | Non-standard convenience method to convert a string to markdown. 
78 | Given that many converters produce HTML as intermediate output, this 79 | allows for easy conversion of HTML to markdown. 80 | """ 81 | return self.convert( 82 | file_stream=io.BytesIO(html_content.encode("utf-8")), 83 | stream_info=StreamInfo( 84 | mimetype="text/html", 85 | extension=".html", 86 | charset="utf-8", 87 | url=url, 88 | ), 89 | **kwargs, 90 | ) 91 | -------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/converters/_image_converter.py: -------------------------------------------------------------------------------- 1 | from typing import BinaryIO, Any, Union 2 | import base64 3 | import mimetypes 4 | from ._exiftool import exiftool_metadata 5 | from .._base_converter import DocumentConverter, DocumentConverterResult 6 | from .._stream_info import StreamInfo 7 | 8 | ACCEPTED_MIME_TYPE_PREFIXES = [ 9 | "image/jpeg", 10 | "image/png", 11 | ] 12 | 13 | ACCEPTED_FILE_EXTENSIONS = [".jpg", ".jpeg", ".png"] 14 | 15 | 16 | class ImageConverter(DocumentConverter): 17 | """ 18 | Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured). 
19 | """ 20 | 21 | def accepts( 22 | self, 23 | file_stream: BinaryIO, 24 | stream_info: StreamInfo, 25 | **kwargs: Any, 26 | ) -> bool: 27 | mimetype = (stream_info.mimetype or "").lower() 28 | extension = (stream_info.extension or "").lower() 29 | 30 | if extension in ACCEPTED_FILE_EXTENSIONS: 31 | return True 32 | 33 | for prefix in ACCEPTED_MIME_TYPE_PREFIXES: 34 | if mimetype.startswith(prefix): 35 | return True 36 | 37 | return False 38 | 39 | def convert( 40 | self, 41 | file_stream: BinaryIO, 42 | stream_info: StreamInfo, 43 | **kwargs: Any, # Options to pass to the converter 44 | ) -> DocumentConverterResult: 45 | md_content = "" 46 | 47 | # Add metadata 48 | metadata = exiftool_metadata( 49 | file_stream, exiftool_path=kwargs.get("exiftool_path") 50 | ) 51 | 52 | if metadata: 53 | for f in [ 54 | "ImageSize", 55 | "Title", 56 | "Caption", 57 | "Description", 58 | "Keywords", 59 | "Artist", 60 | "Author", 61 | "DateTimeOriginal", 62 | "CreateDate", 63 | "GPSPosition", 64 | ]: 65 | if f in metadata: 66 | md_content += f"{f}: {metadata[f]}\n" 67 | 68 | # Try describing the image with GPT 69 | llm_client = kwargs.get("llm_client") 70 | llm_model = kwargs.get("llm_model") 71 | if llm_client is not None and llm_model is not None: 72 | llm_description = self._get_llm_description( 73 | file_stream, 74 | stream_info, 75 | client=llm_client, 76 | model=llm_model, 77 | prompt=kwargs.get("llm_prompt"), 78 | ) 79 | 80 | if llm_description is not None: 81 | md_content += "\n# Description:\n" + llm_description.strip() + "\n" 82 | 83 | return DocumentConverterResult( 84 | markdown=md_content, 85 | ) 86 | 87 | def _get_llm_description( 88 | self, 89 | file_stream: BinaryIO, 90 | stream_info: StreamInfo, 91 | *, 92 | client, 93 | model, 94 | prompt=None, 95 | ) -> Union[None, str]: 96 | if prompt is None or prompt.strip() == "": 97 | prompt = "Write a detailed caption for this image." 
98 | 99 | # Get the content type 100 | content_type = stream_info.mimetype 101 | if not content_type: 102 | content_type, _ = mimetypes.guess_type( 103 | "_dummy" + (stream_info.extension or "") 104 | ) 105 | if not content_type: 106 | content_type = "application/octet-stream" 107 | 108 | # Convert to base64 109 | cur_pos = file_stream.tell() 110 | try: 111 | base64_image = base64.b64encode(file_stream.read()).decode("utf-8") 112 | except Exception as e: 113 | return None 114 | finally: 115 | file_stream.seek(cur_pos) 116 | 117 | # Prepare the data-uri 118 | data_uri = f"data:{content_type};base64,{base64_image}" 119 | 120 | # Prepare the OpenAI API request 121 | messages = [ 122 | { 123 | "role": "user", 124 | "content": [ 125 | {"type": "text", "text": prompt}, 126 | { 127 | "type": "image_url", 128 | "image_url": { 129 | "url": data_uri, 130 | }, 131 | }, 132 | ], 133 | } 134 | ] 135 | 136 | # Call the OpenAI API 137 | response = client.chat.completions.create(model=model, messages=messages) 138 | return response.choices[0].message.content 139 | -------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/converters/_ipynb_converter.py: -------------------------------------------------------------------------------- 1 | from typing import BinaryIO, Any 2 | import json 3 | 4 | from .._base_converter import DocumentConverter, DocumentConverterResult 5 | from .._exceptions import FileConversionException 6 | from .._stream_info import StreamInfo 7 | 8 | CANDIDATE_MIME_TYPE_PREFIXES = [ 9 | "application/json", 10 | ] 11 | 12 | ACCEPTED_FILE_EXTENSIONS = [".ipynb"] 13 | 14 | 15 | class IpynbConverter(DocumentConverter): 16 | """Converts Jupyter Notebook (.ipynb) files to Markdown.""" 17 | 18 | def accepts( 19 | self, 20 | file_stream: BinaryIO, 21 | stream_info: StreamInfo, 22 | **kwargs: Any, # Options to pass to the converter 23 | ) -> bool: 24 | mimetype = (stream_info.mimetype or "").lower() 25 | extension = 
(stream_info.extension or "").lower() 26 | 27 | if extension in ACCEPTED_FILE_EXTENSIONS: 28 | return True 29 | 30 | for prefix in CANDIDATE_MIME_TYPE_PREFIXES: 31 | if mimetype.startswith(prefix): 32 | # Read further to see if it's a notebook 33 | cur_pos = file_stream.tell() 34 | try: 35 | encoding = stream_info.charset or "utf-8" 36 | notebook_content = file_stream.read().decode(encoding) 37 | return ( 38 | "nbformat" in notebook_content 39 | and "nbformat_minor" in notebook_content 40 | ) 41 | finally: 42 | file_stream.seek(cur_pos) 43 | 44 | return False 45 | 46 | def convert( 47 | self, 48 | file_stream: BinaryIO, 49 | stream_info: StreamInfo, 50 | **kwargs: Any, # Options to pass to the converter 51 | ) -> DocumentConverterResult: 52 | # Parse and convert the notebook 53 | encoding = stream_info.charset or "utf-8" 54 | notebook_content = file_stream.read().decode(encoding=encoding) 55 | return self._convert(json.loads(notebook_content)) 56 | 57 | def _convert(self, notebook_content: dict) -> DocumentConverterResult: 58 | """Helper function that converts notebook JSON content to Markdown.""" 59 | try: 60 | md_output = [] 61 | title = None 62 | 63 | for cell in notebook_content.get("cells", []): 64 | cell_type = cell.get("cell_type", "") 65 | source_lines = cell.get("source", []) 66 | 67 | if cell_type == "markdown": 68 | md_output.append("".join(source_lines)) 69 | 70 | # Extract the first # heading as title if not already found 71 | if title is None: 72 | for line in source_lines: 73 | if line.startswith("# "): 74 | title = line.lstrip("# ").strip() 75 | break 76 | 77 | elif cell_type == "code": 78 | # Code cells are wrapped in Markdown code blocks 79 | md_output.append(f"```python\n{''.join(source_lines)}\n```") 80 | elif cell_type == "raw": 81 | md_output.append(f"```\n{''.join(source_lines)}\n```") 82 | 83 | md_text = "\n\n".join(md_output) 84 | 85 | # Check for title in notebook metadata 86 | title = notebook_content.get("metadata", {}).get("title", 
title) 87 | 88 | return DocumentConverterResult( 89 | markdown=md_text, 90 | title=title, 91 | ) 92 | 93 | except Exception as e: 94 | raise FileConversionException( 95 | f"Error converting .ipynb file: {str(e)}" 96 | ) from e 97 | -------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/converters/_llm_caption.py: -------------------------------------------------------------------------------- 1 | from typing import BinaryIO, Union 2 | import base64 3 | import mimetypes 4 | from .._stream_info import StreamInfo 5 | 6 | 7 | def llm_caption( 8 | file_stream: BinaryIO, stream_info: StreamInfo, *, client, model, prompt=None 9 | ) -> Union[None, str]: 10 | if prompt is None or prompt.strip() == "": 11 | prompt = "Write a detailed caption for this image." 12 | 13 | # Get the content type 14 | content_type = stream_info.mimetype 15 | if not content_type: 16 | content_type, _ = mimetypes.guess_type("_dummy" + (stream_info.extension or "")) 17 | if not content_type: 18 | content_type = "application/octet-stream" 19 | 20 | # Convert to base64 21 | cur_pos = file_stream.tell() 22 | try: 23 | base64_image = base64.b64encode(file_stream.read()).decode("utf-8") 24 | except Exception as e: 25 | return None 26 | finally: 27 | file_stream.seek(cur_pos) 28 | 29 | # Prepare the data-uri 30 | data_uri = f"data:{content_type};base64,{base64_image}" 31 | 32 | # Prepare the OpenAI API request 33 | messages = [ 34 | { 35 | "role": "user", 36 | "content": [ 37 | {"type": "text", "text": prompt}, 38 | { 39 | "type": "image_url", 40 | "image_url": { 41 | "url": data_uri, 42 | }, 43 | }, 44 | ], 45 | } 46 | ] 47 | 48 | # Call the OpenAI API 49 | response = client.chat.completions.create(model=model, messages=messages) 50 | return response.choices[0].message.content 51 | -------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/converters/_markdownify.py: 
import re
import markdownify

from typing import Any, Optional
from urllib.parse import quote, unquote, urlparse, urlunparse


class _CustomMarkdownify(markdownify.MarkdownConverter):
    """
    A custom version of markdownify's MarkdownConverter. Changes include:

    - Altering the default heading style to use '#', '##', etc.
    - Removing javascript hyperlinks.
    - Truncating images with large data:uri sources.
    - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
    """

    def __init__(self, **options: Any):
        # Default to ATX headings and data-URI stripping unless overridden
        options["heading_style"] = options.get("heading_style", markdownify.ATX)
        options["keep_data_uris"] = options.get("keep_data_uris", False)
        # Explicitly cast options to the expected type if necessary
        super().__init__(**options)

    def convert_hn(
        self,
        n: int,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ) -> str:
        """Same as usual, but be sure to start with a new line"""
        if not convert_as_inline:
            if not re.search(r"^\n", text):
                return "\n" + super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

        return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

    def convert_a(
        self,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ):
        """Same as usual converter, but removes Javascript links and escapes URIs."""
        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
        if not text:
            return ""

        # Links inside <pre> blocks are emitted as plain text
        if el.find_parent("pre") is not None:
            return text

        href = el.get("href")
        title = el.get("title")

        # Escape URIs and skip non-http or file schemes
        if href:
            try:
                parsed_url = urlparse(href)  # type: ignore
                if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:  # type: ignore
                    return "%s%s%s" % (prefix, text, suffix)
                # Re-quote the path (unquote first so already-quoted paths are
                # not double-escaped)
                href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path))))  # type: ignore
            except ValueError:  # It's not clear if this ever gets thrown
                return "%s%s%s" % (prefix, text, suffix)

        # For the replacement see #29: text nodes underscores are escaped
        if (
            self.options["autolinks"]
            and text.replace(r"\_", "_") == href
            and not title
            and not self.options["default_title"]
        ):
            # Shortcut syntax
            return "<%s>" % href
        if self.options["default_title"] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        return (
            "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)
            if href
            else text
        )

    def convert_img(
        self,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ) -> str:
        """Same as usual converter, but removes data URIs"""

        alt = el.attrs.get("alt", None) or ""
        src = el.attrs.get("src", None) or ""
        title = el.attrs.get("title", None) or ""
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        if (
            convert_as_inline
            and el.parent.name not in self.options["keep_inline_images_in"]
        ):
            return alt

        # Remove dataURIs (keep only the mime prefix, marked with "...")
        if src.startswith("data:") and not self.options["keep_data_uris"]:
            src = src.split(",")[0] + "..."

        return "![%s](%s%s)" % (alt, src, title_part)

    def convert_soup(self, soup: Any) -> str:
        # Thin pass-through to the base implementation
        return super().convert_soup(soup)  # type: ignore


import sys
from typing import Any, Union, BinaryIO
from .._stream_info import StreamInfo
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
olefile = None
try:
    import olefile  # type: ignore[no-redef]
except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()

ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/vnd.ms-outlook",
]

ACCEPTED_FILE_EXTENSIONS = [".msg"]


class OutlookMsgConverter(DocumentConverter):
    """Converts Outlook .msg files to markdown by extracting email metadata and content.

    Uses the olefile package to parse the .msg file structure and extract:
    - Email headers (From, To, Subject)
    - Email body content
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        # Check the extension and mimetype
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        # Brute force, check if we have an OLE file
        cur_pos = file_stream.tell()
        try:
            if olefile and not olefile.isOleFile(file_stream):
                return False
        finally:
            file_stream.seek(cur_pos)

        # Brute force, check if it's an Outlook file
        try:
            if olefile is not None:
                msg = olefile.OleFileIO(file_stream)
                toc = "\n".join([str(stream) for stream in msg.listdir()])
                return (
                    "__properties_version1.0" in toc
                    and "__recip_version1.0_#00000000" in toc
                )
        except Exception as e:
            # Best-effort sniffing: unreadable OLE content is treated as "not ours"
            pass
        finally:
            file_stream.seek(cur_pos)

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Check: the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".msg",
                    feature="outlook",
                )
            ) from _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        assert (
            olefile is not None
        )  # If we made it this far, olefile should be available
        msg = olefile.OleFileIO(file_stream)

        # Extract email metadata
        md_content = "# Email Message\n\n"

        # Get headers from the MAPI property streams named below
        headers = {
            "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
            "To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
            "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
        }

        # Add headers to markdown
        for key, value in headers.items():
            if value:
                md_content += f"**{key}:** {value}\n"

        md_content += "\n## Content\n\n"

        # Get email body
        body = self._get_stream_data(msg, "__substg1.0_1000001F")
        if body:
            md_content += body

        msg.close()

        return DocumentConverterResult(
            markdown=md_content.strip(),
            title=headers.get("Subject"),
        )

    def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
        """Helper to safely extract and decode stream data from the MSG file."""
        assert olefile is not None
        assert isinstance(
            msg, olefile.OleFileIO
        )  # Ensure msg is of the correct type (type hinting is not possible with the optional olefile package)

        try:
            if msg.exists(stream_path):
                data = msg.openstream(stream_path).read()
                # Try UTF-16 first (common for .msg files)
                try:
                    return data.decode("utf-16-le").strip()
                except UnicodeDecodeError:
                    # Fall back to UTF-8
                    try:
                        return data.decode("utf-8").strip()
                    except UnicodeDecodeError:
                        # Last resort - ignore errors
                        return data.decode("utf-8", errors="ignore").strip()
        except Exception:
            pass
        return None


import sys
import io

from typing import BinaryIO, Any


from .._base_converter import DocumentConverter,
class PdfConverter(DocumentConverter):
    """
    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        # Accept on a recognized file extension, or on any matching mimetype prefix.
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        return any(
            mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Surface the saved import failure (with its original traceback)
        # if pdfminer was unavailable at module load time.
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".pdf",
                    feature="pdf",
                )
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        assert isinstance(file_stream, io.IOBase)  # for mypy
        extracted = pdfminer.high_level.extract_text(file_stream)
        return DocumentConverterResult(markdown=extracted)
class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        # If we have a charset, we can safely assume it's text
        # With Magika in the earlier stages, this handles most cases
        if stream_info.charset is not None:
            return True

        # Otherwise, check the extension and mimetype prefixes
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        return any(
            mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        raw = file_stream.read()
        if stream_info.charset:
            # A known charset: decode directly.
            text_content = raw.decode(stream_info.charset)
        else:
            # No charset: let charset_normalizer pick the best guess.
            text_content = str(from_bytes(raw).best())

        return DocumentConverterResult(markdown=text_content)
class PptxConverter(DocumentConverter):
    """
    Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
    """

    def __init__(self):
        super().__init__()
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Convert a PPTX stream to Markdown, slide by slide.

        Shapes on each slide are visited in top-left reading order; pictures,
        tables, charts, text frames, and (recursively) grouped shapes are
        rendered. Slide notes are appended after each slide's content.

        Raises:
            MissingDependencyException: if python-pptx is not installed.
        """
        # Check the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".pptx",
                    feature="pptx",
                )
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        # Perform the conversion
        presentation = pptx.Presentation(file_stream)
        md_content = ""
        slide_num = 0
        for slide in presentation.slides:
            slide_num += 1

            md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"

            title = slide.shapes.title

            def get_shape_content(shape, **kwargs):
                nonlocal md_content
                # Pictures
                if self._is_picture(shape):
                    # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069

                    llm_description = ""
                    alt_text = ""

                    # Potentially generate a description using an LLM
                    llm_client = kwargs.get("llm_client")
                    llm_model = kwargs.get("llm_model")
                    if llm_client is not None and llm_model is not None:
                        # Prepare a file_stream and stream_info for the image data
                        image_filename = shape.image.filename
                        image_extension = None
                        if image_filename:
                            image_extension = os.path.splitext(image_filename)[1]
                        image_stream_info = StreamInfo(
                            mimetype=shape.image.content_type,
                            extension=image_extension,
                            filename=image_filename,
                        )

                        image_stream = io.BytesIO(shape.image.blob)

                        # Caption the image
                        try:
                            llm_description = llm_caption(
                                image_stream,
                                image_stream_info,
                                client=llm_client,
                                model=llm_model,
                                prompt=kwargs.get("llm_prompt"),
                            )
                        except Exception:
                            # Unable to generate a description
                            pass

                    # Also grab any description embedded in the deck
                    try:
                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
                    except Exception:
                        # Unable to get alt text
                        pass

                    # Prepare the alt, escaping any special characters
                    alt_text = "\n".join([llm_description, alt_text]) or shape.name
                    alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
                    alt_text = re.sub(r"\s+", " ", alt_text).strip()

                    # If keep_data_uris is True, use base64 encoding for images
                    if kwargs.get("keep_data_uris", False):
                        blob = shape.image.blob
                        content_type = shape.image.content_type or "image/png"
                        b64_string = base64.b64encode(blob).decode("utf-8")
                        md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
                    else:
                        # A placeholder name
                        filename = re.sub(r"\W", "", shape.name) + ".jpg"
                        md_content += "\n![" + alt_text + "](" + filename + ")\n"

                # Tables
                if self._is_table(shape):
                    md_content += self._convert_table_to_markdown(shape.table, **kwargs)

                # Charts
                if shape.has_chart:
                    md_content += self._convert_chart_to_markdown(shape.chart)

                # Text areas
                elif shape.has_text_frame:
                    if shape == title:
                        md_content += "# " + shape.text.lstrip() + "\n"
                    else:
                        md_content += shape.text + "\n"

                # Group Shapes
                if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
                    sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left"))
                    for subshape in sorted_shapes:
                        get_shape_content(subshape, **kwargs)

            # Visit shapes in top-left reading order
            sorted_shapes = sorted(slide.shapes, key=attrgetter("top", "left"))
            for shape in sorted_shapes:
                get_shape_content(shape, **kwargs)

            md_content = md_content.strip()

            if slide.has_notes_slide:
                md_content += "\n\n### Notes:\n"
                notes_frame = slide.notes_slide.notes_text_frame
                if notes_frame is not None:
                    md_content += notes_frame.text
                md_content = md_content.strip()

        return DocumentConverterResult(markdown=md_content.strip())

    def _is_picture(self, shape):
        # True for picture shapes, or placeholders that carry an image.
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
            return True
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
            if hasattr(shape, "image"):
                return True
        return False

    def _is_table(self, shape):
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
            return True
        return False

    def _convert_table_to_markdown(self, table, **kwargs):
        # Write the table as HTML, then convert it to Markdown
        html_table = "<html><body><table>"
        first_row = True
        for row in table.rows:
            html_table += "<tr>"
            for cell in row.cells:
                if first_row:
                    html_table += "<th>" + html.escape(cell.text) + "</th>"
                else:
                    html_table += "<td>" + html.escape(cell.text) + "</td>"
            html_table += "</tr>"
            first_row = False
        html_table += "</table></body></html>"

        return (
            self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
            + "\n"
        )

    def _convert_chart_to_markdown(self, chart):
        """Render a chart as a Markdown table; always returns a string.

        Bug fix: the original could fall through the `except ValueError`
        branch (when the message did not mention an unsupported plot type)
        and implicitly return None, which crashed the caller's string
        concatenation with a TypeError.
        """
        try:
            md = "\n\n### Chart"
            if chart.has_title:
                md += f": {chart.chart_title.text_frame.text}"
            md += "\n\n"
            data = []
            category_names = [c.label for c in chart.plots[0].categories]
            series_names = [s.name for s in chart.series]
            data.append(["Category"] + series_names)

            for idx, category in enumerate(category_names):
                row = [category]
                for series in chart.series:
                    row.append(series.values[idx])
                data.append(row)

            markdown_table = []
            for row in data:
                markdown_table.append("| " + " | ".join(map(str, row)) + " |")
            header = markdown_table[0]
            separator = "|" + "|".join(["---"] * len(data[0])) + "|"
            return md + "\n".join([header, separator] + markdown_table[1:])
        except Exception:
            # Covers unsupported plot types (ValueError) and any other
            # extraction failure; previously some ValueError paths returned
            # None here.
            return "\n\n[unsupported chart]\n\n"
PRECISE_FILE_EXTENSIONS = [".rss", ".atom"]

CANDIDATE_MIME_TYPE_PREFIXES = [
    "text/xml",
    "application/xml",
]

CANDIDATE_FILE_EXTENSIONS = [
    ".xml",
]


class RssConverter(DocumentConverter):
    """Convert RSS / Atom type to markdown"""

    def __init__(self):
        super().__init__()
        self._kwargs = {}

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        # Check for precise mimetypes and file extensions
        if extension in PRECISE_FILE_EXTENSIONS:
            return True

        for prefix in PRECISE_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        # Generic XML types are only candidates; sniff the content
        if extension in CANDIDATE_FILE_EXTENSIONS:
            return self._check_xml(file_stream)

        for prefix in CANDIDATE_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return self._check_xml(file_stream)

        return False

    def _check_xml(self, file_stream: BinaryIO) -> bool:
        """Return True if the stream parses as a recognizable RSS/Atom feed.

        The stream position is restored before returning.
        """
        cur_pos = file_stream.tell()
        try:
            doc = minidom.parse(file_stream)
            return self._feed_type(doc) is not None
        except BaseException as _:
            pass
        finally:
            file_stream.seek(cur_pos)
        return False

    def _feed_type(self, doc: Any) -> str | None:
        """Classify a parsed XML document as "rss", "atom", or None."""
        if doc.getElementsByTagName("rss"):
            return "rss"
        elif doc.getElementsByTagName("feed"):
            root = doc.getElementsByTagName("feed")[0]
            if root.getElementsByTagName("entry"):
                # An Atom feed must have a root element of <feed> and at least one <entry>
                return "atom"
        return None

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        self._kwargs = kwargs
        doc = minidom.parse(file_stream)
        feed_type = self._feed_type(doc)

        if feed_type == "rss":
            return self._parse_rss_type(doc)
        elif feed_type == "atom":
            return self._parse_atom_type(doc)
        else:
            raise ValueError("Unknown feed type")

    def _parse_atom_type(self, doc: Document) -> DocumentConverterResult:
        """Render an Atom feed document as Markdown.

        Raises if the document lacks the expected <feed> structure.
        """
        root = doc.getElementsByTagName("feed")[0]
        title = self._get_data_by_tag_name(root, "title")
        subtitle = self._get_data_by_tag_name(root, "subtitle")
        entries = root.getElementsByTagName("entry")
        md_text = f"# {title}\n"
        if subtitle:
            md_text += f"{subtitle}\n"
        for entry in entries:
            entry_title = self._get_data_by_tag_name(entry, "title")
            entry_summary = self._get_data_by_tag_name(entry, "summary")
            entry_updated = self._get_data_by_tag_name(entry, "updated")
            entry_content = self._get_data_by_tag_name(entry, "content")

            if entry_title:
                md_text += f"\n## {entry_title}\n"
            if entry_updated:
                md_text += f"Updated on: {entry_updated}\n"
            if entry_summary:
                md_text += self._parse_content(entry_summary)
            if entry_content:
                md_text += self._parse_content(entry_content)

        return DocumentConverterResult(
            markdown=md_text,
            title=title,
        )

    def _parse_rss_type(self, doc: Document) -> DocumentConverterResult:
        """Render an RSS feed document as Markdown.

        Raises:
            ValueError: if the feed has no <channel> element.
        """
        root = doc.getElementsByTagName("rss")[0]
        channel_list = root.getElementsByTagName("channel")
        if not channel_list:
            raise ValueError("No channel found in RSS feed")
        channel = channel_list[0]
        channel_title = self._get_data_by_tag_name(channel, "title")
        channel_description = self._get_data_by_tag_name(channel, "description")
        items = channel.getElementsByTagName("item")
        # Bug fix: md_text was previously assigned only when channel_title
        # was truthy, causing an UnboundLocalError for untitled channels.
        md_text = ""
        if channel_title:
            md_text = f"# {channel_title}\n"
        if channel_description:
            md_text += f"{channel_description}\n"
        for item in items:
            title = self._get_data_by_tag_name(item, "title")
            description = self._get_data_by_tag_name(item, "description")
            pubDate = self._get_data_by_tag_name(item, "pubDate")
            content = self._get_data_by_tag_name(item, "content:encoded")

            if title:
                md_text += f"\n## {title}\n"
            if pubDate:
                md_text += f"Published on: {pubDate}\n"
            if description:
                md_text += self._parse_content(description)
            if content:
                md_text += self._parse_content(content)

        return DocumentConverterResult(
            markdown=md_text,
            title=channel_title,
        )

    def _parse_content(self, content: str) -> str:
        """Parse the content of an RSS feed item"""
        try:
            # using bs4 because many RSS feeds have HTML-styled content
            soup = BeautifulSoup(content, "html.parser")
            return _CustomMarkdownify(**self._kwargs).convert_soup(soup)
        except BaseException as _:
            return content

    def _get_data_by_tag_name(
        self, element: Element, tag_name: str
    ) -> Union[str, None]:
        """Get data from first child element with the given tag name.
        Returns None when no such element is found.
        """
        nodes = element.getElementsByTagName(tag_name)
        if not nodes:
            return None
        fc = nodes[0].firstChild
        if fc:
            if hasattr(fc, "data"):
                return fc.data
        return None
def transcribe_audio(file_stream: BinaryIO, *, audio_format: str = "wav") -> str:
    """Transcribe speech from an audio stream using Google's recognizer.

    Args:
        file_stream: Binary audio stream.
        audio_format: One of "wav", "aiff", "flac" (used directly) or
            "mp3"/"mp4" (converted to wav via pydub first).

    Returns:
        The transcript text, or "[No speech detected]" when empty.

    Raises:
        MissingDependencyException: if speech_recognition/pydub are missing.
        ValueError: for unsupported audio formats.
    """
    # Check for installed dependencies
    if _dependency_exc_info is not None:
        raise MissingDependencyException(
            "Speech transcription requires installing MarkItDown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`"
        ) from _dependency_exc_info[
            1
        ].with_traceback(  # type: ignore[union-attr]
            _dependency_exc_info[2]
        )

    if audio_format in ["wav", "aiff", "flac"]:
        # Formats speech_recognition can read directly
        audio_source = file_stream
    elif audio_format in ["mp3", "mp4"]:
        # Convert compressed formats to an in-memory wav first
        audio_segment = pydub.AudioSegment.from_file(file_stream, format=audio_format)

        audio_source = io.BytesIO()
        audio_segment.export(audio_source, format="wav")
        audio_source.seek(0)
    else:
        raise ValueError(f"Unsupported audio format: {audio_format}")

    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_source) as source:
        audio = recognizer.record(source)
        transcript = recognizer.recognize_google(audio).strip()
        return "[No speech detected]" if transcript == "" else transcript
class WikipediaConverter(DocumentConverter):
    """Handle Wikipedia pages separately, focusing only on the main document content."""

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Make sure we're dealing with HTML content *from* Wikipedia.
        """

        url = stream_info.url or ""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        # Bug fix: the dot in "wikipedia.org" was unescaped, so the pattern
        # also matched hosts like "en.wikipediaXorg".
        if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia\.org\/", url):
            # Not a Wikipedia URL
            return False

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        # Not HTML content
        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Convert a Wikipedia HTML page to Markdown, keeping only the article body."""
        # Parse the stream
        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
        soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
            script.extract()

        # Print only the main content
        body_elm = soup.find("div", {"id": "mw-content-text"})
        title_elm = soup.find("span", {"class": "mw-page-title-main"})

        webpage_text = ""
        main_title = None if soup.title is None else soup.title.string

        if body_elm:
            # What's the title
            if title_elm and isinstance(title_elm, bs4.Tag):
                main_title = title_elm.string

            # Convert the page
            webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(
                **kwargs
            ).convert_soup(body_elm)
        else:
            # Fall back to converting the whole document
            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)

        return DocumentConverterResult(
            markdown=webpage_text,
            title=main_title,
        )
class XlsxConverter(DocumentConverter):
    """
    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
    """

    def __init__(self):
        super().__init__()
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        # Accept on a matching extension or mimetype prefix.
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_XLSX_FILE_EXTENSIONS:
            return True

        return any(
            mimetype.startswith(prefix)
            for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Surface the saved import failure if pandas/openpyxl are unavailable.
        if _xlsx_dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".xlsx",
                    feature="xlsx",
                )
            ) from _xlsx_dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _xlsx_dependency_exc_info[2]
            )

        # One "## <sheet name>" section per sheet, each rendered via the
        # HTML converter from pandas' HTML table output.
        sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
        sections = []
        for sheet_name, frame in sheets.items():
            table_md = self._html_converter.convert_string(
                frame.to_html(index=False), **kwargs
            ).markdown.strip()
            sections.append(f"## {sheet_name}\n{table_md}\n\n")

        return DocumentConverterResult(markdown="".join(sections).strip())
101 | """ 102 | 103 | def __init__(self): 104 | super().__init__() 105 | self._html_converter = HtmlConverter() 106 | 107 | def accepts( 108 | self, 109 | file_stream: BinaryIO, 110 | stream_info: StreamInfo, 111 | **kwargs: Any, # Options to pass to the converter 112 | ) -> bool: 113 | mimetype = (stream_info.mimetype or "").lower() 114 | extension = (stream_info.extension or "").lower() 115 | 116 | if extension in ACCEPTED_XLS_FILE_EXTENSIONS: 117 | return True 118 | 119 | for prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES: 120 | if mimetype.startswith(prefix): 121 | return True 122 | 123 | return False 124 | 125 | def convert( 126 | self, 127 | file_stream: BinaryIO, 128 | stream_info: StreamInfo, 129 | **kwargs: Any, # Options to pass to the converter 130 | ) -> DocumentConverterResult: 131 | # Load the dependencies 132 | if _xls_dependency_exc_info is not None: 133 | raise MissingDependencyException( 134 | MISSING_DEPENDENCY_MESSAGE.format( 135 | converter=type(self).__name__, 136 | extension=".xls", 137 | feature="xls", 138 | ) 139 | ) from _xls_dependency_exc_info[ 140 | 1 141 | ].with_traceback( # type: ignore[union-attr] 142 | _xls_dependency_exc_info[2] 143 | ) 144 | 145 | sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd") 146 | md_content = "" 147 | for s in sheets: 148 | md_content += f"## {s}\n" 149 | html_content = sheets[s].to_html(index=False) 150 | md_content += ( 151 | self._html_converter.convert_string( 152 | html_content, **kwargs 153 | ).markdown.strip() 154 | + "\n\n" 155 | ) 156 | 157 | return DocumentConverterResult(markdown=md_content.strip()) 158 | -------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/converters/_youtube_converter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | import re 4 | import bs4 5 | from typing import Any, BinaryIO, Dict, List, Union 6 | from urllib.parse import 
parse_qs, urlparse, unquote 7 | 8 | from .._base_converter import DocumentConverter, DocumentConverterResult 9 | from .._stream_info import StreamInfo 10 | 11 | # Optional YouTube transcription support 12 | try: 13 | # Suppress some warnings on library import 14 | import warnings 15 | 16 | with warnings.catch_warnings(): 17 | warnings.filterwarnings("ignore", category=SyntaxWarning) 18 | # Patch submitted upstream to fix the SyntaxWarning 19 | from youtube_transcript_api import YouTubeTranscriptApi 20 | 21 | IS_YOUTUBE_TRANSCRIPT_CAPABLE = True 22 | except ModuleNotFoundError: 23 | IS_YOUTUBE_TRANSCRIPT_CAPABLE = False 24 | 25 | 26 | ACCEPTED_MIME_TYPE_PREFIXES = [ 27 | "text/html", 28 | "application/xhtml", 29 | ] 30 | 31 | ACCEPTED_FILE_EXTENSIONS = [ 32 | ".html", 33 | ".htm", 34 | ] 35 | 36 | 37 | class YouTubeConverter(DocumentConverter): 38 | """Handle YouTube specially, focusing on the video title, description, and transcript.""" 39 | 40 | def accepts( 41 | self, 42 | file_stream: BinaryIO, 43 | stream_info: StreamInfo, 44 | **kwargs: Any, # Options to pass to the converter 45 | ) -> bool: 46 | """ 47 | Make sure we're dealing with HTML content *from* YouTube. 
48 | """ 49 | url = stream_info.url or "" 50 | mimetype = (stream_info.mimetype or "").lower() 51 | extension = (stream_info.extension or "").lower() 52 | 53 | url = unquote(url) 54 | url = url.replace(r"\?", "?").replace(r"\=", "=") 55 | 56 | if not url.startswith("https://www.youtube.com/watch?"): 57 | # Not a YouTube URL 58 | return False 59 | 60 | if extension in ACCEPTED_FILE_EXTENSIONS: 61 | return True 62 | 63 | for prefix in ACCEPTED_MIME_TYPE_PREFIXES: 64 | if mimetype.startswith(prefix): 65 | return True 66 | 67 | # Not HTML content 68 | return False 69 | 70 | def convert( 71 | self, 72 | file_stream: BinaryIO, 73 | stream_info: StreamInfo, 74 | **kwargs: Any, # Options to pass to the converter 75 | ) -> DocumentConverterResult: 76 | # Parse the stream 77 | encoding = "utf-8" if stream_info.charset is None else stream_info.charset 78 | soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) 79 | 80 | # Read the meta tags 81 | metadata: Dict[str, str] = {} 82 | 83 | if soup.title and soup.title.string: 84 | metadata["title"] = soup.title.string 85 | 86 | for meta in soup(["meta"]): 87 | if not isinstance(meta, bs4.Tag): 88 | continue 89 | 90 | for a in meta.attrs: 91 | if a in ["itemprop", "property", "name"]: 92 | key = str(meta.get(a, "")) 93 | content = str(meta.get("content", "")) 94 | if key and content: # Only add non-empty content 95 | metadata[key] = content 96 | break 97 | 98 | # Try reading the description 99 | try: 100 | for script in soup(["script"]): 101 | if not isinstance(script, bs4.Tag): 102 | continue 103 | if not script.string: # Skip empty scripts 104 | continue 105 | content = script.string 106 | if "ytInitialData" in content: 107 | match = re.search(r"var ytInitialData = ({.*?});", content) 108 | if match: 109 | data = json.loads(match.group(1)) 110 | attrdesc = self._findKey(data, "attributedDescriptionBodyText") 111 | if attrdesc and isinstance(attrdesc, dict): 112 | metadata["description"] = 
str(attrdesc.get("content", "")) 113 | break 114 | except Exception as e: 115 | print(f"Error extracting description: {e}") 116 | pass 117 | 118 | # Start preparing the page 119 | webpage_text = "# YouTube\n" 120 | 121 | title = self._get(metadata, ["title", "og:title", "name"]) # type: ignore 122 | assert isinstance(title, str) 123 | 124 | if title: 125 | webpage_text += f"\n## {title}\n" 126 | 127 | stats = "" 128 | views = self._get(metadata, ["interactionCount"]) # type: ignore 129 | if views: 130 | stats += f"- **Views:** {views}\n" 131 | 132 | keywords = self._get(metadata, ["keywords"]) # type: ignore 133 | if keywords: 134 | stats += f"- **Keywords:** {keywords}\n" 135 | 136 | runtime = self._get(metadata, ["duration"]) # type: ignore 137 | if runtime: 138 | stats += f"- **Runtime:** {runtime}\n" 139 | 140 | if len(stats) > 0: 141 | webpage_text += f"\n### Video Metadata\n{stats}\n" 142 | 143 | description = self._get(metadata, ["description", "og:description"]) # type: ignore 144 | if description: 145 | webpage_text += f"\n### Description\n{description}\n" 146 | 147 | if IS_YOUTUBE_TRANSCRIPT_CAPABLE: 148 | ytt_api = YouTubeTranscriptApi() 149 | transcript_text = "" 150 | parsed_url = urlparse(stream_info.url) # type: ignore 151 | params = parse_qs(parsed_url.query) # type: ignore 152 | if "v" in params and params["v"][0]: 153 | video_id = str(params["v"][0]) 154 | transcript_list = ytt_api.list(video_id) 155 | languages = ["en"] 156 | for transcript in transcript_list: 157 | languages.append(transcript.language_code) 158 | break 159 | try: 160 | youtube_transcript_languages = kwargs.get( 161 | "youtube_transcript_languages", languages 162 | ) 163 | # Retry the transcript fetching operation 164 | transcript = self._retry_operation( 165 | lambda: ytt_api.fetch( 166 | video_id, languages=youtube_transcript_languages 167 | ), 168 | retries=3, # Retry 3 times 169 | delay=2, # 2 seconds delay between retries 170 | ) 171 | 172 | if transcript: 173 | 
transcript_text = " ".join( 174 | [part.text for part in transcript] 175 | ) # type: ignore 176 | except Exception as e: 177 | # No transcript available 178 | if len(languages) == 1: 179 | print(f"Error fetching transcript: {e}") 180 | else: 181 | # Translate transcript into first kwarg 182 | transcript = ( 183 | transcript_list.find_transcript(languages) 184 | .translate(youtube_transcript_languages[0]) 185 | .fetch() 186 | ) 187 | transcript_text = " ".join([part.text for part in transcript]) 188 | if transcript_text: 189 | webpage_text += f"\n### Transcript\n{transcript_text}\n" 190 | 191 | title = title if title else (soup.title.string if soup.title else "") 192 | assert isinstance(title, str) 193 | 194 | return DocumentConverterResult( 195 | markdown=webpage_text, 196 | title=title, 197 | ) 198 | 199 | def _get( 200 | self, 201 | metadata: Dict[str, str], 202 | keys: List[str], 203 | default: Union[str, None] = None, 204 | ) -> Union[str, None]: 205 | """Get first non-empty value from metadata matching given keys.""" 206 | for k in keys: 207 | if k in metadata: 208 | return metadata[k] 209 | return default 210 | 211 | def _findKey(self, json: Any, key: str) -> Union[str, None]: # TODO: Fix json type 212 | """Recursively search for a key in nested dictionary/list structures.""" 213 | if isinstance(json, list): 214 | for elm in json: 215 | ret = self._findKey(elm, key) 216 | if ret is not None: 217 | return ret 218 | elif isinstance(json, dict): 219 | for k, v in json.items(): 220 | if k == key: 221 | return json[k] 222 | if result := self._findKey(v, key): 223 | return result 224 | return None 225 | 226 | def _retry_operation(self, operation, retries=3, delay=2): 227 | """Retries the operation if it fails.""" 228 | attempt = 0 229 | while attempt < retries: 230 | try: 231 | return operation() # Attempt the operation 232 | except Exception as e: 233 | print(f"Attempt {attempt + 1} failed: {e}") 234 | if attempt < retries - 1: 235 | time.sleep(delay) # Wait 
before retrying 236 | attempt += 1 237 | # If all attempts fail, raise the last exception 238 | raise Exception(f"Operation failed after {retries} attempts.") 239 | -------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/converters/_zip_converter.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | import io 3 | import os 4 | 5 | from typing import BinaryIO, Any, TYPE_CHECKING 6 | 7 | from .._base_converter import DocumentConverter, DocumentConverterResult 8 | from .._stream_info import StreamInfo 9 | from .._exceptions import UnsupportedFormatException, FileConversionException 10 | 11 | # Break otherwise circular import for type hinting 12 | if TYPE_CHECKING: 13 | from .._markitdown import MarkItDown 14 | 15 | ACCEPTED_MIME_TYPE_PREFIXES = [ 16 | "application/zip", 17 | ] 18 | 19 | ACCEPTED_FILE_EXTENSIONS = [".zip"] 20 | 21 | 22 | class ZipConverter(DocumentConverter): 23 | """Converts ZIP files to markdown by extracting and converting all contained files. 24 | 25 | The converter extracts the ZIP contents to a temporary directory, processes each file 26 | using appropriate converters based on file extensions, and then combines the results 27 | into a single markdown document. The temporary directory is cleaned up after processing. 
28 | 29 | Example output format: 30 | ```markdown 31 | Content from the zip file `example.zip`: 32 | 33 | ## File: docs/readme.txt 34 | 35 | This is the content of readme.txt 36 | Multiple lines are preserved 37 | 38 | ## File: images/example.jpg 39 | 40 | ImageSize: 1920x1080 41 | DateTimeOriginal: 2024-02-15 14:30:00 42 | Description: A beautiful landscape photo 43 | 44 | ## File: data/report.xlsx 45 | 46 | ## Sheet1 47 | | Column1 | Column2 | Column3 | 48 | |---------|---------|---------| 49 | | data1 | data2 | data3 | 50 | | data4 | data5 | data6 | 51 | ``` 52 | 53 | Key features: 54 | - Maintains original file structure in headings 55 | - Processes nested files recursively 56 | - Uses appropriate converters for each file type 57 | - Preserves formatting of converted content 58 | - Cleans up temporary files after processing 59 | """ 60 | 61 | def __init__( 62 | self, 63 | *, 64 | markitdown: "MarkItDown", 65 | ): 66 | super().__init__() 67 | self._markitdown = markitdown 68 | 69 | def accepts( 70 | self, 71 | file_stream: BinaryIO, 72 | stream_info: StreamInfo, 73 | **kwargs: Any, # Options to pass to the converter 74 | ) -> bool: 75 | mimetype = (stream_info.mimetype or "").lower() 76 | extension = (stream_info.extension or "").lower() 77 | 78 | if extension in ACCEPTED_FILE_EXTENSIONS: 79 | return True 80 | 81 | for prefix in ACCEPTED_MIME_TYPE_PREFIXES: 82 | if mimetype.startswith(prefix): 83 | return True 84 | 85 | return False 86 | 87 | def convert( 88 | self, 89 | file_stream: BinaryIO, 90 | stream_info: StreamInfo, 91 | **kwargs: Any, # Options to pass to the converter 92 | ) -> DocumentConverterResult: 93 | file_path = stream_info.url or stream_info.local_path or stream_info.filename 94 | md_content = f"Content from the zip file `{file_path}`:\n\n" 95 | 96 | with zipfile.ZipFile(file_stream, "r") as zipObj: 97 | for name in zipObj.namelist(): 98 | try: 99 | z_file_stream = io.BytesIO(zipObj.read(name)) 100 | z_file_stream_info = StreamInfo( 101 | 
extension=os.path.splitext(name)[1], 102 | filename=os.path.basename(name), 103 | ) 104 | result = self._markitdown.convert_stream( 105 | stream=z_file_stream, 106 | stream_info=z_file_stream_info, 107 | ) 108 | if result is not None: 109 | md_content += f"## File: {name}\n\n" 110 | md_content += result.markdown + "\n\n" 111 | except UnsupportedFormatException: 112 | pass 113 | except FileConversionException: 114 | pass 115 | 116 | return DocumentConverterResult(markdown=md_content.strip()) 117 | -------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown/src/markitdown/py.typed -------------------------------------------------------------------------------- /packages/markitdown/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney 2 | # 3 | # SPDX-License-Identifier: MIT 4 | -------------------------------------------------------------------------------- /packages/markitdown/tests/_test_vectors.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from typing import List 3 | 4 | 5 | @dataclasses.dataclass(frozen=True, kw_only=True) 6 | class FileTestVector(object): 7 | filename: str 8 | mimetype: str | None 9 | charset: str | None 10 | url: str | None 11 | must_include: List[str] 12 | must_not_include: List[str] 13 | 14 | 15 | GENERAL_TEST_VECTORS = [ 16 | FileTestVector( 17 | filename="test.docx", 18 | mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document", 19 | charset=None, 20 | url=None, 21 | must_include=[ 22 | "314b0a30-5b04-470b-b9f7-eed2c2bec74a", 23 | "49e168b7-d2ae-407f-a055-2167576f39a1", 24 | "## 
d666f1f7-46cb-42bd-9a39-9a39cf2a509f", 25 | "# Abstract", 26 | "# Introduction", 27 | "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 28 | "data:image/png;base64...", 29 | ], 30 | must_not_include=[ 31 | "data:image/png;base64,iVBORw0KGgoAAAANSU", 32 | ], 33 | ), 34 | FileTestVector( 35 | filename="test.xlsx", 36 | mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 37 | charset=None, 38 | url=None, 39 | must_include=[ 40 | "## 09060124-b5e7-4717-9d07-3c046eb", 41 | "6ff4173b-42a5-4784-9b19-f49caff4d93d", 42 | "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", 43 | ], 44 | must_not_include=[], 45 | ), 46 | FileTestVector( 47 | filename="test.xls", 48 | mimetype="application/vnd.ms-excel", 49 | charset=None, 50 | url=None, 51 | must_include=[ 52 | "## 09060124-b5e7-4717-9d07-3c046eb", 53 | "6ff4173b-42a5-4784-9b19-f49caff4d93d", 54 | "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", 55 | ], 56 | must_not_include=[], 57 | ), 58 | FileTestVector( 59 | filename="test.pptx", 60 | mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation", 61 | charset=None, 62 | url=None, 63 | must_include=[ 64 | "2cdda5c8-e50e-4db4-b5f0-9722a649f455", 65 | "04191ea8-5c73-4215-a1d3-1cfb43aaaf12", 66 | "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a", 67 | "1b92870d-e3b5-4e65-8153-919f4ff45592", 68 | "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 69 | "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title 70 | "2003", # chart value 71 | "![This phrase of the caption is Human-written.](Picture4.jpg)", 72 | ], 73 | must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"], 74 | ), 75 | FileTestVector( 76 | filename="test_outlook_msg.msg", 77 | mimetype="application/vnd.ms-outlook", 78 | charset=None, 79 | url=None, 80 | must_include=[ 81 | "# Email Message", 82 | "**From:** test.sender@example.com", 83 | "**To:** test.recipient@example.com", 84 | "**Subject:** Test Email Message", 85 | "## Content", 86 | 
"This is the body of the test email message", 87 | ], 88 | must_not_include=[], 89 | ), 90 | FileTestVector( 91 | filename="test.pdf", 92 | mimetype="application/pdf", 93 | charset=None, 94 | url=None, 95 | must_include=[ 96 | "While there is contemporaneous exploration of multi-agent approaches" 97 | ], 98 | must_not_include=[], 99 | ), 100 | FileTestVector( 101 | filename="test_blog.html", 102 | mimetype="text/html", 103 | charset="utf-8", 104 | url="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math", 105 | must_include=[ 106 | "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?", 107 | "an example where high cost can easily prevent a generic complex", 108 | ], 109 | must_not_include=[], 110 | ), 111 | FileTestVector( 112 | filename="test_wikipedia.html", 113 | mimetype="text/html", 114 | charset="utf-8", 115 | url="https://en.wikipedia.org/wiki/Microsoft", 116 | must_include=[ 117 | "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]", 118 | 'Microsoft was founded by [Bill Gates](/wiki/Bill_Gates "Bill Gates")', 119 | ], 120 | must_not_include=[ 121 | "You are encouraged to create an account and log in", 122 | "154 languages", 123 | "move to sidebar", 124 | ], 125 | ), 126 | FileTestVector( 127 | filename="test_serp.html", 128 | mimetype="text/html", 129 | charset="utf-8", 130 | url="https://www.bing.com/search?q=microsoft+wikipedia", 131 | must_include=[ 132 | "](https://en.wikipedia.org/wiki/Microsoft", 133 | "Microsoft Corporation is **an American multinational corporation and technology company headquartered** in Redmond", 134 | "1995–2007: Foray into the Web, Windows 95, Windows XP, and Xbox", 135 | ], 136 | must_not_include=[ 137 | 
"https://www.bing.com/ck/a?!&&p=", 138 | "data:image/svg+xml,%3Csvg%20width%3D", 139 | ], 140 | ), 141 | FileTestVector( 142 | filename="test_mskanji.csv", 143 | mimetype="text/csv", 144 | charset="cp932", 145 | url=None, 146 | must_include=[ 147 | "| 名前 | 年齢 | 住所 |", 148 | "| --- | --- | --- |", 149 | "| 佐藤太郎 | 30 | 東京 |", 150 | "| 三木英子 | 25 | 大阪 |", 151 | "| 髙橋淳 | 35 | 名古屋 |", 152 | ], 153 | must_not_include=[], 154 | ), 155 | FileTestVector( 156 | filename="test.json", 157 | mimetype="application/json", 158 | charset="ascii", 159 | url=None, 160 | must_include=[ 161 | "5b64c88c-b3c3-4510-bcb8-da0b200602d8", 162 | "9700dc99-6685-40b4-9a3a-5e406dcb37f3", 163 | ], 164 | must_not_include=[], 165 | ), 166 | FileTestVector( 167 | filename="test_rss.xml", 168 | mimetype="text/xml", 169 | charset="utf-8", 170 | url=None, 171 | must_include=[ 172 | "# The Official Microsoft Blog", 173 | "## Ignite 2024: Why nearly 70% of the Fortune 500 now use Microsoft 365 Copilot", 174 | "In the case of AI, it is absolutely true that the industry is moving incredibly fast", 175 | ], 176 | must_not_include=[" This is a blockquote for testing", 234 | ], 235 | must_not_include=[], 236 | ), 237 | ] 238 | 239 | 240 | DATA_URI_TEST_VECTORS = [ 241 | FileTestVector( 242 | filename="test.docx", 243 | mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document", 244 | charset=None, 245 | url=None, 246 | must_include=[ 247 | "314b0a30-5b04-470b-b9f7-eed2c2bec74a", 248 | "49e168b7-d2ae-407f-a055-2167576f39a1", 249 | "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f", 250 | "# Abstract", 251 | "# Introduction", 252 | "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 253 | "data:image/png;base64,iVBORw0KGgoAAAANSU", 254 | ], 255 | must_not_include=[ 256 | "data:image/png;base64...", 257 | ], 258 | ), 259 | FileTestVector( 260 | filename="test.pptx", 261 | mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation", 262 | charset=None, 
263 | url=None, 264 | must_include=[ 265 | "2cdda5c8-e50e-4db4-b5f0-9722a649f455", 266 | "04191ea8-5c73-4215-a1d3-1cfb43aaaf12", 267 | "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a", 268 | "1b92870d-e3b5-4e65-8153-919f4ff45592", 269 | "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 270 | "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title 271 | "2003", # chart value 272 | "![This phrase of the caption is Human-written.]", # image caption 273 | "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE", 274 | ], 275 | must_not_include=[ 276 | "![This phrase of the caption is Human-written.](Picture4.jpg)", 277 | ], 278 | ), 279 | ] 280 | -------------------------------------------------------------------------------- /packages/markitdown/tests/test_cli_misc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 -m pytest 2 | import subprocess 3 | from markitdown import __version__ 4 | 5 | # This file contains CLI tests that are not directly tested by the FileTestVectors. 6 | # This includes things like help messages, version numbers, and invalid flags. 
7 | 8 | 9 | def test_version() -> None: 10 | result = subprocess.run( 11 | ["python", "-m", "markitdown", "--version"], capture_output=True, text=True 12 | ) 13 | 14 | assert result.returncode == 0, f"CLI exited with error: {result.stderr}" 15 | assert __version__ in result.stdout, f"Version not found in output: {result.stdout}" 16 | 17 | 18 | def test_invalid_flag() -> None: 19 | result = subprocess.run( 20 | ["python", "-m", "markitdown", "--foobar"], capture_output=True, text=True 21 | ) 22 | 23 | assert result.returncode != 0, f"CLI exited with error: {result.stderr}" 24 | assert ( 25 | "unrecognized arguments" in result.stderr 26 | ), "Expected 'unrecognized arguments' to appear in STDERR" 27 | assert "SYNTAX" in result.stderr, "Expected 'SYNTAX' to appear in STDERR" 28 | 29 | 30 | if __name__ == "__main__": 31 | """Runs this file's tests from the command line.""" 32 | test_version() 33 | test_invalid_flag() 34 | print("All tests passed!") 35 | -------------------------------------------------------------------------------- /packages/markitdown/tests/test_cli_vectors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 -m pytest 2 | import os 3 | import time 4 | import pytest 5 | import subprocess 6 | import locale 7 | from typing import List 8 | 9 | if __name__ == "__main__": 10 | from _test_vectors import ( 11 | GENERAL_TEST_VECTORS, 12 | DATA_URI_TEST_VECTORS, 13 | FileTestVector, 14 | ) 15 | else: 16 | from ._test_vectors import ( 17 | GENERAL_TEST_VECTORS, 18 | DATA_URI_TEST_VECTORS, 19 | FileTestVector, 20 | ) 21 | 22 | skip_remote = ( 23 | True if os.environ.get("GITHUB_ACTIONS") else False 24 | ) # Don't run these tests in CI 25 | 26 | TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") 27 | TEST_FILES_URL = "https://raw.githubusercontent.com/microsoft/markitdown/refs/heads/main/packages/markitdown/tests/test_files" 28 | 29 | 30 | # Prepare CLI test vectors (remove vectors 
that require mockig the url) 31 | CLI_TEST_VECTORS: List[FileTestVector] = [] 32 | for test_vector in GENERAL_TEST_VECTORS: 33 | if test_vector.url is not None: 34 | continue 35 | CLI_TEST_VECTORS.append(test_vector) 36 | 37 | 38 | @pytest.fixture(scope="session") 39 | def shared_tmp_dir(tmp_path_factory): 40 | return tmp_path_factory.mktemp("pytest_tmp") 41 | 42 | 43 | @pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS) 44 | def test_output_to_stdout(shared_tmp_dir, test_vector) -> None: 45 | """Test that the CLI outputs to stdout correctly.""" 46 | 47 | result = subprocess.run( 48 | [ 49 | "python", 50 | "-m", 51 | "markitdown", 52 | os.path.join(TEST_FILES_DIR, test_vector.filename), 53 | ], 54 | capture_output=True, 55 | text=True, 56 | ) 57 | 58 | assert result.returncode == 0, f"CLI exited with error: {result.stderr}" 59 | for test_string in test_vector.must_include: 60 | assert test_string in result.stdout 61 | for test_string in test_vector.must_not_include: 62 | assert test_string not in result.stdout 63 | 64 | 65 | @pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS) 66 | def test_output_to_file(shared_tmp_dir, test_vector) -> None: 67 | """Test that the CLI outputs to a file correctly.""" 68 | 69 | output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output") 70 | result = subprocess.run( 71 | [ 72 | "python", 73 | "-m", 74 | "markitdown", 75 | "-o", 76 | output_file, 77 | os.path.join(TEST_FILES_DIR, test_vector.filename), 78 | ], 79 | capture_output=True, 80 | text=True, 81 | ) 82 | 83 | assert result.returncode == 0, f"CLI exited with error: {result.stderr}" 84 | assert os.path.exists(output_file), f"Output file not created: {output_file}" 85 | 86 | with open(output_file, "r") as f: 87 | output_data = f.read() 88 | for test_string in test_vector.must_include: 89 | assert test_string in output_data 90 | for test_string in test_vector.must_not_include: 91 | assert test_string not in output_data 92 | 93 | os.remove(output_file) 
94 | assert not os.path.exists(output_file), f"Output file not deleted: {output_file}" 95 | 96 | 97 | @pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS) 98 | def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None: 99 | """Test that the CLI readds from stdin correctly.""" 100 | 101 | test_input = b"" 102 | with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream: 103 | test_input = stream.read() 104 | 105 | result = subprocess.run( 106 | [ 107 | "python", 108 | "-m", 109 | "markitdown", 110 | os.path.join(TEST_FILES_DIR, test_vector.filename), 111 | ], 112 | input=test_input, 113 | capture_output=True, 114 | text=False, 115 | ) 116 | 117 | stdout = result.stdout.decode(locale.getpreferredencoding()) 118 | assert ( 119 | result.returncode == 0 120 | ), f"CLI exited with error: {result.stderr.decode('utf-8')}" 121 | for test_string in test_vector.must_include: 122 | assert test_string in stdout 123 | for test_string in test_vector.must_not_include: 124 | assert test_string not in stdout 125 | 126 | 127 | @pytest.mark.skipif( 128 | skip_remote, 129 | reason="do not run tests that query external urls", 130 | ) 131 | @pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS) 132 | def test_convert_url(shared_tmp_dir, test_vector): 133 | """Test the conversion of a stream with no stream info.""" 134 | # Note: tmp_dir is not used here, but is needed to match the signature 135 | 136 | time.sleep(1) # Ensure we don't hit rate limits 137 | result = subprocess.run( 138 | ["python", "-m", "markitdown", TEST_FILES_URL + "/" + test_vector.filename], 139 | capture_output=True, 140 | text=False, 141 | ) 142 | 143 | stdout = result.stdout.decode(locale.getpreferredencoding()) 144 | assert result.returncode == 0, f"CLI exited with error: {result.stderr}" 145 | for test_string in test_vector.must_include: 146 | assert test_string in stdout 147 | for test_string in test_vector.must_not_include: 148 | assert test_string not in stdout 149 
| 150 | 151 | @pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS) 152 | def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None: 153 | """Test CLI functionality when keep_data_uris is enabled""" 154 | 155 | output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output") 156 | result = subprocess.run( 157 | [ 158 | "python", 159 | "-m", 160 | "markitdown", 161 | "--keep-data-uris", 162 | "-o", 163 | output_file, 164 | os.path.join(TEST_FILES_DIR, test_vector.filename), 165 | ], 166 | capture_output=True, 167 | text=True, 168 | ) 169 | 170 | assert result.returncode == 0, f"CLI exited with error: {result.stderr}" 171 | assert os.path.exists(output_file), f"Output file not created: {output_file}" 172 | 173 | with open(output_file, "r") as f: 174 | output_data = f.read() 175 | for test_string in test_vector.must_include: 176 | assert test_string in output_data 177 | for test_string in test_vector.must_not_include: 178 | assert test_string not in output_data 179 | 180 | os.remove(output_file) 181 | assert not os.path.exists(output_file), f"Output file not deleted: {output_file}" 182 | 183 | 184 | if __name__ == "__main__": 185 | import tempfile 186 | 187 | """Runs this file's tests from the command line.""" 188 | 189 | with tempfile.TemporaryDirectory() as tmp_dir: 190 | # General tests 191 | for test_function in [ 192 | test_output_to_stdout, 193 | test_output_to_file, 194 | test_input_from_stdin_without_hints, 195 | test_convert_url, 196 | ]: 197 | for test_vector in CLI_TEST_VECTORS: 198 | print( 199 | f"Running {test_function.__name__} on {test_vector.filename}...", 200 | end="", 201 | ) 202 | test_function(tmp_dir, test_vector) 203 | print("OK") 204 | 205 | # Data URI tests 206 | for test_function in [ 207 | test_output_to_file_with_data_uris, 208 | ]: 209 | for test_vector in DATA_URI_TEST_VECTORS: 210 | print( 211 | f"Running {test_function.__name__} on {test_vector.filename}...", 212 | end="", 213 | ) 214 | 
test_function(tmp_dir, test_vector) 215 | print("OK") 216 | 217 | print("All tests passed!") 218 | -------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/equations.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown/tests/test_files/equations.docx -------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/random.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown/tests/test_files/random.bin -------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/test.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown/tests/test_files/test.docx -------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/test.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown/tests/test_files/test.epub -------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown/tests/test_files/test.jpg -------------------------------------------------------------------------------- 
/packages/markitdown/tests/test_files/test.json: -------------------------------------------------------------------------------- 1 | { 2 | "key1": "string_value", 3 | "key2": 1234, 4 | "key3": [ 5 | "list_value1", 6 | "list_value2" 7 | ], 8 | "5b64c88c-b3c3-4510-bcb8-da0b200602d8": "uuid_key", 9 | "uuid_value": "9700dc99-6685-40b4-9a3a-5e406dcb37f3" 10 | } 11 | -------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/test.m4a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown/tests/test_files/test.m4a -------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/test.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown/tests/test_files/test.mp3 -------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown/tests/test_files/test.pdf -------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/test.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown/tests/test_files/test.pptx -------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/test.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown/tests/test_files/test.wav -------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/test.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown/tests/test_files/test.xls -------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown/tests/test_files/test.xlsx -------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/test_files.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown/tests/test_files/test_files.zip -------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/test_llm.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown/tests/test_files/test_llm.jpg -------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/test_mskanji.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown/tests/test_files/test_mskanji.csv 
-------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/test_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "0f61db80", 6 | "metadata": {}, 7 | "source": [ 8 | "# Test Notebook" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 11, 14 | "id": "3f2a5bbd", 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "markitdown\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "print(\"markitdown\")" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "9b9c0468", 32 | "metadata": {}, 33 | "source": [ 34 | "## Code Cell Below" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 10, 40 | "id": "37d8088a", 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "42\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "# comment in code\n", 53 | "print(42)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "2e3177bd", 59 | "metadata": {}, 60 | "source": [ 61 | "End\n", 62 | "\n", 63 | "---" 64 | ] 65 | } 66 | ], 67 | "metadata": { 68 | "kernelspec": { 69 | "display_name": "Python 3", 70 | "language": "python", 71 | "name": "python3" 72 | }, 73 | "language_info": { 74 | "codemirror_mode": { 75 | "name": "ipython", 76 | "version": 3 77 | }, 78 | "file_extension": ".py", 79 | "mimetype": "text/x-python", 80 | "name": "python", 81 | "nbconvert_exporter": "python", 82 | "pygments_lexer": "ipython3", 83 | "version": "3.12.8" 84 | }, 85 | "title": "Test Notebook Title" 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 5 89 | } 90 | -------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/test_outlook_msg.msg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown/tests/test_files/test_outlook_msg.msg -------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/test_with_comment.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/markitdown/62b72284feb986ffaf8c22fa73614545b5713c30/packages/markitdown/tests/test_files/test_with_comment.docx -------------------------------------------------------------------------------- /packages/markitdown/tests/test_module_vectors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 -m pytest 2 | import os 3 | import time 4 | import pytest 5 | import base64 6 | 7 | from pathlib import Path 8 | 9 | if __name__ == "__main__": 10 | from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS 11 | else: 12 | from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS 13 | 14 | from markitdown import ( 15 | MarkItDown, 16 | StreamInfo, 17 | ) 18 | 19 | skip_remote = ( 20 | True if os.environ.get("GITHUB_ACTIONS") else False 21 | ) # Don't run these tests in CI 22 | 23 | TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") 24 | TEST_FILES_URL = "https://raw.githubusercontent.com/microsoft/markitdown/refs/heads/main/packages/markitdown/tests/test_files" 25 | 26 | 27 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) 28 | def test_guess_stream_info(test_vector): 29 | """Test the ability to guess stream info.""" 30 | markitdown = MarkItDown() 31 | 32 | local_path = os.path.join(TEST_FILES_DIR, test_vector.filename) 33 | expected_extension = os.path.splitext(test_vector.filename)[1] 34 | 35 | with open(local_path, "rb") as stream: 36 | guesses = 
markitdown._get_stream_info_guesses( 37 | stream, 38 | base_guess=StreamInfo( 39 | filename=os.path.basename(test_vector.filename), 40 | local_path=local_path, 41 | extension=expected_extension, 42 | ), 43 | ) 44 | 45 | # For some limited exceptions, we can't guarantee the exact 46 | # mimetype or extension, so we'll special-case them here. 47 | if test_vector.filename in [ 48 | "test_outlook_msg.msg", 49 | ]: 50 | return 51 | 52 | assert guesses[0].mimetype == test_vector.mimetype 53 | assert guesses[0].extension == expected_extension 54 | assert guesses[0].charset == test_vector.charset 55 | 56 | 57 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) 58 | def test_convert_local(test_vector): 59 | """Test the conversion of a local file.""" 60 | markitdown = MarkItDown() 61 | 62 | result = markitdown.convert( 63 | os.path.join(TEST_FILES_DIR, test_vector.filename), url=test_vector.url 64 | ) 65 | for string in test_vector.must_include: 66 | assert string in result.markdown 67 | for string in test_vector.must_not_include: 68 | assert string not in result.markdown 69 | 70 | 71 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) 72 | def test_convert_stream_with_hints(test_vector): 73 | """Test the conversion of a stream with full stream info.""" 74 | markitdown = MarkItDown() 75 | 76 | stream_info = StreamInfo( 77 | extension=os.path.splitext(test_vector.filename)[1], 78 | mimetype=test_vector.mimetype, 79 | charset=test_vector.charset, 80 | ) 81 | 82 | with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream: 83 | result = markitdown.convert( 84 | stream, stream_info=stream_info, url=test_vector.url 85 | ) 86 | for string in test_vector.must_include: 87 | assert string in result.markdown 88 | for string in test_vector.must_not_include: 89 | assert string not in result.markdown 90 | 91 | 92 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) 93 | def test_convert_stream_without_hints(test_vector): 94 | """Test 
the conversion of a stream with no stream info.""" 95 | markitdown = MarkItDown() 96 | 97 | with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream: 98 | result = markitdown.convert(stream, url=test_vector.url) 99 | for string in test_vector.must_include: 100 | assert string in result.markdown 101 | for string in test_vector.must_not_include: 102 | assert string not in result.markdown 103 | 104 | 105 | @pytest.mark.skipif( 106 | skip_remote, 107 | reason="do not run tests that query external urls", 108 | ) 109 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) 110 | def test_convert_http_uri(test_vector): 111 | """Test the conversion of an HTTP:// or HTTPS:// URI.""" 112 | markitdown = MarkItDown() 113 | 114 | time.sleep(1) # Ensure we don't hit rate limits 115 | 116 | result = markitdown.convert( 117 | TEST_FILES_URL + "/" + test_vector.filename, 118 | url=test_vector.url, # Mock where this file would be found 119 | ) 120 | for string in test_vector.must_include: 121 | assert string in result.markdown 122 | for string in test_vector.must_not_include: 123 | assert string not in result.markdown 124 | 125 | 126 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) 127 | def test_convert_file_uri(test_vector): 128 | """Test the conversion of a file:// URI.""" 129 | markitdown = MarkItDown() 130 | 131 | result = markitdown.convert( 132 | Path(os.path.join(TEST_FILES_DIR, test_vector.filename)).as_uri(), 133 | url=test_vector.url, 134 | ) 135 | for string in test_vector.must_include: 136 | assert string in result.markdown 137 | for string in test_vector.must_not_include: 138 | assert string not in result.markdown 139 | 140 | 141 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) 142 | def test_convert_data_uri(test_vector): 143 | """Test the conversion of a data URI.""" 144 | markitdown = MarkItDown() 145 | 146 | data = "" 147 | with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream: 148 | 
data = base64.b64encode(stream.read()).decode("utf-8") 149 | mimetype = test_vector.mimetype 150 | data_uri = f"data:{mimetype};base64,{data}" 151 | 152 | result = markitdown.convert( 153 | data_uri, 154 | url=test_vector.url, 155 | ) 156 | for string in test_vector.must_include: 157 | assert string in result.markdown 158 | for string in test_vector.must_not_include: 159 | assert string not in result.markdown 160 | 161 | 162 | @pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS) 163 | def test_convert_keep_data_uris(test_vector): 164 | """Test API functionality when keep_data_uris is enabled""" 165 | markitdown = MarkItDown() 166 | 167 | # Test local file conversion 168 | result = markitdown.convert( 169 | os.path.join(TEST_FILES_DIR, test_vector.filename), 170 | keep_data_uris=True, 171 | url=test_vector.url, 172 | ) 173 | 174 | for string in test_vector.must_include: 175 | assert string in result.markdown 176 | for string in test_vector.must_not_include: 177 | assert string not in result.markdown 178 | 179 | 180 | @pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS) 181 | def test_convert_stream_keep_data_uris(test_vector): 182 | """Test the conversion of a stream with no stream info.""" 183 | markitdown = MarkItDown() 184 | 185 | stream_info = StreamInfo( 186 | extension=os.path.splitext(test_vector.filename)[1], 187 | mimetype=test_vector.mimetype, 188 | charset=test_vector.charset, 189 | ) 190 | 191 | with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream: 192 | result = markitdown.convert( 193 | stream, stream_info=stream_info, keep_data_uris=True, url=test_vector.url 194 | ) 195 | 196 | for string in test_vector.must_include: 197 | assert string in result.markdown 198 | for string in test_vector.must_not_include: 199 | assert string not in result.markdown 200 | 201 | 202 | if __name__ == "__main__": 203 | """Runs this file's tests from the command line.""" 204 | 205 | # General tests 206 | for test_function in [ 207 
| test_guess_stream_info, 208 | test_convert_local, 209 | test_convert_stream_with_hints, 210 | test_convert_stream_without_hints, 211 | test_convert_http_uri, 212 | test_convert_file_uri, 213 | test_convert_data_uri, 214 | ]: 215 | for test_vector in GENERAL_TEST_VECTORS: 216 | print( 217 | f"Running {test_function.__name__} on {test_vector.filename}...", end="" 218 | ) 219 | test_function(test_vector) 220 | print("OK") 221 | 222 | # Data URI tests 223 | for test_function in [ 224 | test_convert_keep_data_uris, 225 | test_convert_stream_keep_data_uris, 226 | ]: 227 | for test_vector in DATA_URI_TEST_VECTORS: 228 | print( 229 | f"Running {test_function.__name__} on {test_vector.filename}...", end="" 230 | ) 231 | test_function(test_vector) 232 | print("OK") 233 | 234 | print("All tests passed!") 235 | --------------------------------------------------------------------------------