├── .github ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE │ ├── bug-report.md │ ├── bug_report.md │ ├── feature-request.md │ ├── feature_request.md │ └── update-request.md ├── SUPPORT.md ├── pull_request_template.md └── workflows │ ├── docker-image.yml │ └── python-publish.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── README.md ├── docs ├── .gitbook │ └── assets │ │ ├── Twitter header - 1.png │ │ └── image.png ├── README.md ├── SUMMARY.md ├── api.md ├── assets │ ├── hero_image.png │ ├── hero_image_2.png │ └── omniparse_logo.png ├── deployment.md ├── examples │ ├── omniprase-x-langchain.md │ ├── omniprase-x-llamaindex.md │ └── vision-rag-using-omniparse.md ├── installation.md └── integration.md ├── download.py ├── examples └── OmniParse_GoogleColab.ipynb ├── omniparse ├── __init__.py ├── chunking │ └── __init__.py ├── demo.py ├── documents │ ├── __init__.py │ └── router.py ├── extraction │ └── __init__.py ├── image │ ├── __init__.py │ ├── process.py │ ├── router.py │ └── utils.py ├── media │ ├── __init__.py │ ├── router.py │ └── utils.py ├── models │ └── __init__.py ├── sheets │ └── __init__.py ├── utils.py └── web │ ├── __init__.py │ ├── config.py │ ├── crawler_strategy.py │ ├── model_loader.py │ ├── models.py │ ├── prompts.py │ ├── router.py │ ├── utils.py │ └── web_crawler.py ├── pyproject.toml ├── python-sdk ├── omniparse_client │ ├── __init__.py │ ├── omniparse.py │ └── utils.py └── pyproject.toml └── server.py /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adithya-s-k/omniparse/9d1ae83c46de777427e67b48f82eaca45ad7994a/.github/CODE_OF_CONDUCT.md -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing 2 | 3 | We welcome contributions to OmniParse. If you'd like to contribute, please fork the repository and use a feature branch. Pull requests are warmly welcome. 4 | 5 | 1. Fork the repo 6 | 2. Create your feature branch (`git checkout -b feature/new-feature`) 7 | 3. Commit your changes (`git commit -am 'Add a new feature'`) 8 | 4. Push to the branch (`git push origin feature/new-feature`) 9 | 5. Create a new Pull Request -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adithya-s-k/omniparse/9d1ae83c46de777427e67b48f82eaca45ad7994a/.github/ISSUE_TEMPLATE/bug-report.md -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. 
iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adithya-s-k/omniparse/9d1ae83c46de777427e67b48f82eaca45ad7994a/.github/ISSUE_TEMPLATE/feature-request.md -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/update-request.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adithya-s-k/omniparse/9d1ae83c46de777427e67b48f82eaca45ad7994a/.github/ISSUE_TEMPLATE/update-request.md -------------------------------------------------------------------------------- /.github/SUPPORT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adithya-s-k/omniparse/9d1ae83c46de777427e67b48f82eaca45ad7994a/.github/SUPPORT.md -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adithya-s-k/omniparse/9d1ae83c46de777427e67b48f82eaca45ad7994a/.github/pull_request_template.md -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | jobs: 10 | build-and-push: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Log in to Docker Hub 16 | uses: docker/login-action@v3 17 | with: 18 | username: ${{ secrets.DOCKER_HUB_USERNAME }} 19 | password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} 20 | 21 | - name: Set up Docker Buildx 22 | uses: docker/setup-buildx-action@v3 23 | 24 | - name: Build and push Docker image 25 | uses: docker/build-push-action@v5 26 | with: 27 | context: . 
 28 |         file: ./Dockerfile
 29 |         push: ${{ github.event_name != 'pull_request' }}
 30 |         tags: |
 31 |           cognitivelab/omniparse:latest
 32 |           cognitivelab/omniparse:${{ github.sha }}
 33 |       id: docker_build  # referenced by the Image digest step below
 34 | 
 35 |     - name: Image digest
 36 |       run: echo ${{ steps.docker_build.outputs.digest }}
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
 1 | name: Upload Python Package
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [published]
 6 | 
 7 | permissions:
 8 |   contents: read
 9 | 
10 | jobs:
11 |   deploy:
12 |     runs-on: ubuntu-latest
13 | 
14 |     steps:
15 |     - uses: actions/checkout@v4
16 |     - name: Set up Python
17 |       uses: actions/setup-python@v3
18 |       with:
19 |         python-version: '3.x'
20 |     - name: Install dependencies
21 |       run: |
22 |         python -m pip install --upgrade pip
23 |         pip install build twine
24 |     - name: Build package
25 |       run: |
26 |         cd python-sdk
27 |         python -m build
28 |     - name: Publish package
29 |       env:
30 |         TWINE_USERNAME: __token__
31 |         TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
32 |       run: |
33 |         cd python-sdk
34 |         python -m twine upload dist/*
35 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | poetry.lock 164 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | . 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. 
Violating these terms may lead to a temporary or
 93 | permanent ban.
 94 | 
 95 | ### 3. Temporary Ban
 96 | 
 97 | **Community Impact**: A serious violation of community standards, including
 98 | sustained inappropriate behavior.
 99 | 
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 | 
106 | ### 4. Permanent Ban
107 | 
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior, harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 | 
112 | **Consequence**: A permanent ban from any sort of public interaction within
113 | the community.
114 | 
115 | ## Attribution
116 | 
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.0, available at
119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
120 | 
121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct
122 | enforcement ladder](https://github.com/mozilla/diversity).
123 | 
124 | [homepage]: https://www.contributor-covenant.org
125 | 
126 | For answers to common questions about this code of conduct, see the FAQ at
127 | https://www.contributor-covenant.org/faq. Translations are available at
128 | https://www.contributor-covenant.org/translations.
129 | 
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | ARG CUDA_VERSION="11.8.0"
 2 | ARG CUDNN_VERSION="8"
 3 | ARG UBUNTU_VERSION="22.04"
 4 | ARG MAX_JOBS=4
 5 | 
 6 | FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION
 7 | 
 8 | # Update package lists and install necessary packages
 9 | RUN apt-get update && \
10 |     apt-get install -y --no-install-recommends \
11 |     wget \
12 |     curl \
13 |     unzip \
14 |     git \
15 |     python3 \
16 |     python3-pip \
17 |     libgl1 \
18 |     libglib2.0-0 \
19 |     gnupg2 \
20 |     ca-certificates \
21 |     apt-transport-https \
22 |     software-properties-common \
23 |     libreoffice \
24 |     ffmpeg \
25 |     git-lfs \
26 |     xvfb \
27 |     && ln -s /usr/bin/python3 /usr/bin/python \
28 |     && curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash \
29 |     && wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - \
30 |     && echo "deb http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \
31 |     && apt-get update \
32 |     && apt-get install -y --no-install-recommends python3-packaging \
33 |     && apt-get install -y --no-install-recommends google-chrome-stable \
34 |     && rm -rf /var/lib/apt/lists/*
35 | 
36 | # Download and install ChromeDriver
37 | RUN CHROMEDRIVER_VERSION=$(curl -sS https://chromedriver.storage.googleapis.com/LATEST_RELEASE) && \
38 |     wget -N https://chromedriver.storage.googleapis.com/$CHROMEDRIVER_VERSION/chromedriver_linux64.zip -P /tmp && \
39 |     unzip /tmp/chromedriver_linux64.zip -d /tmp && \
40 |     mv /tmp/chromedriver /usr/local/bin/chromedriver && \
41 |     chmod +x /usr/local/bin/chromedriver && \
42 |     rm /tmp/chromedriver_linux64.zip
43 | 
44 | # Copy Chromedriver from the builder stage (if applicable)
45 | # COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver
46 | 
47 | # Install PyTorch and related packages
48 | RUN pip3 install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
49 | 
50 | # Set up working directory and copy application code
51 | COPY . /app
52 | WORKDIR /app
53 | 
54 | # Install the omniparse package itself (editable install from pyproject.toml)
55 | RUN pip3 install --no-cache-dir -e .
56 | 
57 | # Pin transformers to a version known to work with Florence-2
58 | RUN pip3 install --no-cache-dir transformers==4.41.2
59 | 
60 | # Set environment variables
61 | ENV CHROME_BIN=/usr/bin/google-chrome \
62 |     CHROMEDRIVER=/usr/local/bin/chromedriver \
63 |     DISPLAY=:99 \
64 |     DBUS_SESSION_BUS_ADDRESS=/dev/null \
65 |     PYTHONUNBUFFERED=1
66 | 
67 | # Ensure the PATH environment variable includes the location of the installed packages
68 | ENV PATH /usr/local/bin:$PATH
69 | 
70 | # Expose the desired port
71 | EXPOSE 8000
72 | 
73 | # Run the server
74 | CMD ["python", "server.py", "--host", "0.0.0.0", "--port", "8000", "--documents", "--media", "--web"]
75 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # OmniParse
  2 | 
  3 | ![OmniParse](https://raw.githubusercontent.com/adithya-s-k/omniparse/main/docs/assets/hero_image_2.png)
  4 | [![GitHub Stars](https://img.shields.io/github/stars/adithya-s-k/omniparse?style=social)](https://github.com/adithya-s-k/omniparse/stargazers)
  5 | [![GitHub Forks](https://img.shields.io/github/forks/adithya-s-k/omniparse?style=social)](https://github.com/adithya-s-k/omniparse/network/members)
  6 | [![GitHub Issues](https://img.shields.io/github/issues/adithya-s-k/omniparse)](https://github.com/adithya-s-k/omniparse/issues)
  7 | [![GitHub Pull Requests](https://img.shields.io/github/issues-pr/adithya-s-k/omniparse)](https://github.com/adithya-s-k/omniparse/pulls)
  8 | [![License](https://img.shields.io/github/license/adithya-s-k/omniparse)](https://github.com/adithya-s-k/omniparse/blob/main/LICENSE)
  9 | 
 10 | 
 11 | > [!IMPORTANT]
 12 | >
 13 | > OmniParse is a platform that ingests and parses any unstructured data into structured, actionable data optimized for GenAI (LLM) applications. Whether you are working with documents, tables, images, videos, audio files, or web pages, OmniParse prepares your data to be clean, structured, and ready for AI applications such as RAG, fine-tuning, and more.
 14 | 
 15 | ## Try it out
 16 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/adithya-s-k/omniparse/blob/main/examples/OmniParse_GoogleColab.ipynb)
 17 | 
 18 | ## Intro
 19 | https://github.com/adithya-s-k/omniparse/assets/27956426/457d8b5b-9573-44da-8bcf-616000651a13
 20 | 
 21 | ## Features
 22 | ✅ Completely local, no external APIs \
 23 | ✅ Fits in a T4 GPU \
 24 | ✅ Supports ~20 file types \
 25 | ✅ Convert documents, multimedia, and web pages to high-quality structured markdown \
 26 | ✅ Table extraction, image extraction/captioning, audio/video transcription, web page crawling \
 27 | ✅ Easily deployable using Docker and SkyPilot \
 28 | ✅ Colab friendly \
 29 | ✅ Interactive UI powered by Gradio
 30 | 
 31 | ### Why OmniParse?
 32 | It's challenging to process data as it comes in different shapes and sizes. OmniParse aims to be an ingestion/parsing platform where you can ingest any type of data, such as documents, images, audio, video, and web content, and get the most structured and actionable output that is GenAI (LLM) friendly.
 33 | 
 34 | ## Installation
 35 | > [!IMPORTANT]
 36 | > The server only works on Linux-based systems. This is due to certain dependencies and system-specific configurations that are not compatible with Windows or macOS.
 37 | 
 38 | ```bash
 39 | git clone https://github.com/adithya-s-k/omniparse
 40 | cd omniparse
 41 | ```
 42 | 
 43 | Create a virtual environment:
 44 | 
 45 | ```bash
 46 | conda create -n omniparse-venv python=3.10
 47 | conda activate omniparse-venv
 48 | ```
 49 | 
 50 | Install dependencies:
 51 | 
 52 | ```bash
 53 | poetry install
 54 | # or
 55 | pip install -e .
 56 | # or, for a non-editable install
 57 | pip install .
 58 | ```
 59 | 
 60 | ### 🛳️ Docker
 61 | 
 62 | To use OmniParse with Docker, execute the following commands:
 63 | 
 64 | 1. Pull the OmniParse API Docker image from Docker Hub 👉🏼 [Docker Image](https://hub.docker.com/r/savatar101/omniparse)
 65 | 2. Run the Docker container, exposing port 8000:
 66 | 
 67 | ```bash
 68 | docker pull savatar101/omniparse:0.1
 69 | # if you are running on a GPU
 70 | docker run --gpus all -p 8000:8000 savatar101/omniparse:0.1
 71 | # else
 72 | docker run -p 8000:8000 savatar101/omniparse:0.1
 73 | ```
 74 | 
 75 | Alternatively, if you prefer to build the Docker image locally,
 76 | build it and then run the container as follows:
 77 | 
 78 | ```bash
 79 | docker build -t omniparse .
 80 | # if you are running on a GPU
 81 | docker run --gpus all -p 8000:8000 omniparse
 82 | # else
 83 | docker run -p 8000:8000 omniparse
 84 | ```
 85 | 
 86 | ## Usage
 87 | 
 88 | Run the server:
 89 | 
 90 | ```bash
 91 | python server.py --host 0.0.0.0 --port 8000 --documents --media --web
 92 | ```
 93 | 
 94 | - `--documents`: Load all the models that help you parse and ingest documents (the Surya OCR series of models and Florence-2).
 95 | - `--media`: Load the Whisper model to transcribe audio and video files.
 96 | - `--web`: Set up the Selenium crawler.
 97 | 
 98 | Download models:
 99 | If you want to download the models before starting the server, run:
100 | 
101 | ```bash
102 | python download.py --documents --media --web
103 | ```
104 | 
105 | - `--documents`: Load all the models that help you parse and ingest documents (the Surya OCR series of models and Florence-2).
106 | - `--media`: Load the Whisper model to transcribe audio and video files.
107 | - `--web`: Set up the Selenium crawler.
108 | 
109 | ## Supported Data Types
110 | 
111 | | Type      | Supported Extensions                           |
112 | |-----------|------------------------------------------------|
113 | | Documents | .doc, .docx, .pdf, .ppt, .pptx                 |
114 | | Images    | .png, .jpg, .jpeg, .tiff, .bmp, .heic          |
115 | | Video     | .mp4, .mkv, .avi, .mov                         |
116 | | Audio     | .mp3, .wav, .aac                               |
117 | | Web       | dynamic web pages, any http:// or https:// URL |
118 | 
119 | 
120 | 
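Every endpoint documented below is plain HTTP, so the curl commands map directly onto any HTTP client. As a minimal sketch — assuming the server from the Usage section is running locally on port 8000 and that `requests` is installed (`pip install requests`); the exact response schema may vary between versions — parsing a document from Python looks like this:

```python
import requests

OMNIPARSE_URL = "http://localhost:8000"  # assumed local server address


def parse_document(path: str) -> dict:
    """Upload a local file to /parse_document and return the parsed JSON payload."""
    with open(path, "rb") as f:
        response = requests.post(
            f"{OMNIPARSE_URL}/parse_document",
            files={"file": (path, f)},  # multipart form upload, mirrors the curl examples
            timeout=300,                # large documents can take a while on a cold start
        )
    response.raise_for_status()
    return response.json()


if __name__ == "__main__":
    print(parse_document("sample.pdf"))  # hypothetical input file
```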
121 |

API Endpoints

122 | 123 | > Client library compatible with Langchain, llamaindex, and haystack integrations coming soon. 124 | 125 | - [API Endpoints](#api-endpoints) 126 | - [Document Parsing](#document-parsing) 127 | - [Parse Any Document](#parse-any-document) 128 | - [Parse PDF](#parse-pdf) 129 | - [Parse PowerPoint](#parse-powerpoint) 130 | - [Parse Word Document](#parse-word-document) 131 | - [Media Parsing](#media-parsing) 132 | - [Parse Any Media](#parse-any-media) 133 | - [Parse Image](#parse-image) 134 | - [Process Image](#process-image) 135 | - [Parse Video](#parse-video) 136 | - [Parse Audio](#parse-audio) 137 | - [Website Parsing](#website-parsing) 138 | - [Parse Website](#parse-website) 139 | 140 | ### Document Parsing 141 | 142 | #### Parse Any Document 143 | 144 | Endpoint: `/parse_document` 145 | Method: POST 146 | 147 | Parses PDF, PowerPoint, or Word documents. 148 | 149 | Curl command: 150 | ``` 151 | curl -X POST -F "file=@/path/to/document" http://localhost:8000/parse_document 152 | ``` 153 | 154 | #### Parse PDF 155 | 156 | Endpoint: `/parse_document/pdf` 157 | Method: POST 158 | 159 | Parses PDF documents. 160 | 161 | Curl command: 162 | ``` 163 | curl -X POST -F "file=@/path/to/document.pdf" http://localhost:8000/parse_document/pdf 164 | ``` 165 | 166 | #### Parse PowerPoint 167 | 168 | Endpoint: `/parse_document/ppt` 169 | Method: POST 170 | 171 | Parses PowerPoint presentations. 172 | 173 | Curl command: 174 | ``` 175 | curl -X POST -F "file=@/path/to/presentation.ppt" http://localhost:8000/parse_document/ppt 176 | ``` 177 | 178 | #### Parse Word Document 179 | 180 | Endpoint: `/parse_document/docs` 181 | Method: POST 182 | 183 | Parses Word documents. 184 | 185 | Curl command: 186 | ``` 187 | curl -X POST -F "file=@/path/to/document.docx" http://localhost:8000/parse_document/docs 188 | ``` 189 | 190 | ### Media Parsing 191 | 192 | 203 | 204 | #### Parse Image 205 | 206 | Endpoint: `/parse_image/image` 207 | Method: POST 208 | 209 | Parses image files (PNG, JPEG, JPG, TIFF, WEBP). 210 | 211 | Curl command: 212 | ``` 213 | curl -X POST -F "file=@/path/to/image.jpg" http://localhost:8000/parse_media/image 214 | ``` 215 | 216 | #### Process Image 217 | 218 | Endpoint: `/parse_image/process_image` 219 | Method: POST 220 | 221 | Processes an image with a specific task. 222 | 223 | Possible task inputs: 224 | `OCR | OCR with Region | Caption | Detailed Caption | More Detailed Caption | Object Detection | Dense Region Caption | Region Proposal` 225 | 226 | Curl command: 227 | ``` 228 | curl -X POST -F "image=@/path/to/image.jpg" -F "task=Caption" -F "prompt=Optional prompt" http://localhost:8000/parse_media/process_image 229 | ``` 230 | 231 | Arguments: 232 | - `image`: The image file 233 | - `task`: The processing task (e.g., Caption, Object Detection) 234 | - `prompt`: Optional prompt for certain tasks 235 | 236 | #### Parse Video 237 | 238 | Endpoint: `/parse_media/video` 239 | Method: POST 240 | 241 | Parses video files (MP4, AVI, MOV, MKV). 242 | 243 | Curl command: 244 | ``` 245 | curl -X POST -F "file=@/path/to/video.mp4" http://localhost:8000/parse_media/video 246 | ``` 247 | 248 | #### Parse Audio 249 | 250 | Endpoint: `/parse_media/audio` 251 | Method: POST 252 | 253 | Parses audio files (MP3, WAV, FLAC). 
254 | 
255 | Curl command:
256 | ```
257 | curl -X POST -F "file=@/path/to/audio.mp3" http://localhost:8000/parse_media/audio
258 | ```
259 | 
260 | ### Website Parsing
261 | 
262 | #### Parse Website
263 | 
264 | Endpoint: `/parse_website`
265 | Method: POST
266 | 
267 | Parses a website given its URL.
268 | 
269 | Curl command:
270 | ```
271 | curl -X POST -H "Content-Type: application/json" -d '{"url": "https://example.com"}' http://localhost:8000/parse_website
272 | ```
273 | Arguments:
274 | - `url`: The URL of the website to parse
275 | 
276 | </details>
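Unlike the file-upload endpoints, `/parse_website` expects a JSON body rather than a multipart form. A minimal sketch (assuming a local server on port 8000 and the `requests` package; the helper name and response schema are assumptions, not part of the API contract):

```python
import requests


def parse_website(url: str, base_url: str = "http://localhost:8000") -> dict:
    """Hypothetical helper around the /parse_website endpoint."""
    response = requests.post(
        f"{base_url}/parse_website",
        json={"url": url},  # serialized and sent as Content-Type: application/json
        timeout=300,        # crawling dynamic pages can be slow
    )
    response.raise_for_status()
    return response.json()


print(parse_website("https://example.com"))
```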
277 | 
278 | 
279 | ## Coming Soon / Roadmap
280 | 🦙 LlamaIndex | Langchain | Haystack integrations coming soon \
281 | 📚 Batch processing for handling multiple files at once \
282 | ⭐ Dynamic chunking and structured data extraction based on a specified schema \
283 | 🛠️ One magic API: just feed in your file, prompt what you want, and we will take care of the rest \
284 | 🔧 Dynamic model selection and support for external APIs \
285 | 📦 New open-source model to replace Surya OCR and Marker
286 | 
287 | 
288 | **Final goal**: replace all the different models currently in use with a single multimodal model that can parse any type of data and get you the data you need.
289 | 
290 | 
291 | ## Limitations
292 | A GPU with a minimum of 8–10 GB of VRAM is required, as OmniParse runs several deep learning models.
293 | 
294 | ### Document Parsing Limitations
295 | 
296 | 
297 | - [Marker](https://github.com/VikParuchuri/marker), which is the underlying PDF parser, will not convert 100% of equations to LaTeX, because it has to detect and then convert them.
298 | - It is good at parsing English but might struggle with languages such as Chinese.
299 | - Tables are not always formatted 100% correctly; text can be in the wrong column.
300 | - Whitespace and indentations are not always respected.
301 | - Not all lines/spans will be joined properly.
302 | - This works best on digital PDFs that won't require a lot of OCR. It's optimized for speed, and limited OCR is used to fix errors.
303 | - To fit all the models in the GPU, we are using the smallest variants, which might not offer the best-in-class performance.
304 | 
305 | ## License
306 | OmniParse is licensed under the GPL-3.0 license. See `LICENSE` for more information.
307 | The project uses Marker under the hood, which has a commercial license that needs to be followed. Here are the details:
308 | 
309 | ### Commercial Usage
310 | Marker and Surya OCR models are designed to be as widely accessible as possible while still funding development and training costs. Research and personal usage are always allowed, but there are some restrictions on commercial usage.
311 | The weights for the models are licensed under cc-by-nc-sa-4.0. However, this restriction is waived for any organization with less than $5M USD in gross revenue in the most recent 12-month period AND less than $5M in lifetime VC/angel funding raised. To remove the GPL license requirements (dual-license) and/or use the weights commercially over the revenue limit, check out the options provided.
312 | Please refer to [Marker](https://github.com/VikParuchuri/marker) for more information about the license of the model weights.
313 | 
314 | ## Acknowledgements
315 | 
316 | This project builds upon the remarkable [Marker](https://github.com/VikParuchuri/marker) project created by [Vik Paruchuri](https://twitter.com/VikParuchuri). We express our gratitude for the inspiration and foundation provided by this project. Special thanks to [Surya-OCR](https://github.com/VikParuchuri/surya) and [Texify](https://github.com/VikParuchuri/texify) for the OCR models extensively used in this project, and to [Crawl4AI](https://github.com/unclecode/crawl4ai) for their contributions.
317 | 
318 | Models being used:
319 | - Surya OCR, Detect, Layout, Order, and Texify
320 | - Florence-2 base
321 | - Whisper Small
322 | 
323 | Thank you to the authors for their contributions to these models.
324 | 
325 | ---
326 | 
327 | ## Contact
328 | 

329 | 330 | Star History Chart 331 | 332 |

333 | For any inquiries, please contact us at adithyaskolavi@gmail.com 334 | 335 | 336 | 363 | -------------------------------------------------------------------------------- /docs/.gitbook/assets/Twitter header - 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adithya-s-k/omniparse/9d1ae83c46de777427e67b48f82eaca45ad7994a/docs/.gitbook/assets/Twitter header - 1.png -------------------------------------------------------------------------------- /docs/.gitbook/assets/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adithya-s-k/omniparse/9d1ae83c46de777427e67b48f82eaca45ad7994a/docs/.gitbook/assets/image.png -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | cover: .gitbook/assets/Twitter header - 1.png 3 | coverY: 0 4 | layout: 5 | cover: 6 | visible: true 7 | size: hero 8 | title: 9 | visible: true 10 | description: 11 | visible: true 12 | tableOfContents: 13 | visible: true 14 | outline: 15 | visible: true 16 | pagination: 17 | visible: true 18 | --- 19 | 20 | # OmniParse 21 | 22 | [![GitHub Stars](https://img.shields.io/github/stars/adithya-s-k/omniparse?style=social)](https://github.com/adithya-s-k/omniparse/stargazers) [![GitHub Forks](https://img.shields.io/github/forks/adithya-s-k/omniparse?style=social)](https://github.com/adithya-s-k/omniparse/network/members) [![GitHub Issues](https://img.shields.io/github/issues/adithya-s-k/omniparse)](https://github.com/adithya-s-k/omniparse/issues) [![GitHub Pull Requests](https://img.shields.io/github/issues-pr/adithya-s-k/omniparse)](https://github.com/adithya-s-k/omniparse/pulls) [![License](https://img.shields.io/github/license/adithya-s-k/omniparse)](../LICENSE) 23 | 24 | > **OmniParse is a platform that ingests/parses any unstructured data into structured, actionable data optimized for GenAI (LLM) applcaitons. Whether working with documents, tables, images, videos, audio files, or web pages, OmniParse prepares your data to be clean, structured and ready for AI applications, such as RAG , fine-tuning and more.** 25 | 26 | ### Try it out 27 | 28 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/adithya-s-k/omniparse/blob/main/examples/OmniParse\_GoogleColab.ipynb) 29 | 30 |

OmniPrase UI

 31 | 
 32 | ### Features
 33 | 
 34 | ✅ Completely local, no external APIs\
 35 | ✅ Fits in a T4 GPU\
 36 | ✅ Supports 10+ file types\
 37 | ✅ Convert documents, multimedia, and web pages to high-quality structured markdown\
 38 | ✅ Table extraction, image extraction/captioning, audio/video transcription, web page crawling\
 39 | ✅ Easily deployable using Docker and SkyPilot\
 40 | ✅ Colab friendly
 41 | 
 42 | #### Problem Statement
 43 | 
 44 | It's challenging to process data as it comes in different shapes and sizes. OmniParse aims to be an ingestion/parsing platform where you can ingest any type of data, such as documents, images, audio, video, and web content, and get the most structured and actionable output that is GenAI (LLM) friendly.
 45 | 
 46 | | Original PDF | OmniParse | PyPDF |
 47 | | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 48 | | [![Original PDF](https://github.com/adithya-s-k/marker-api/raw/master/data/images/original\_pdf.png)](https://github.com/adithya-s-k/marker-api/blob/master/data/images/original\_pdf.png) | [![Marker-API](https://github.com/adithya-s-k/marker-api/raw/master/data/images/marker\_api.png)](https://github.com/adithya-s-k/marker-api/blob/master/data/images/marker\_api.png) | [![PyPDF](https://github.com/adithya-s-k/marker-api/raw/master/data/images/pypdf.png)](https://github.com/adithya-s-k/marker-api/blob/master/data/images/pypdf.png) |
 49 | 
 50 | ### Installation
 51 | 
 52 | > Note: The server only works on Linux-based systems, due to certain dependencies and system-specific configurations that are not compatible with Windows or macOS. To install OmniParse, first clone the repository:
 53 | 
 54 | ```bash
 55 | git clone https://github.com/adithya-s-k/omniparse
 56 | cd omniparse
 57 | ```
 58 | 
 59 | Create a virtual environment:
 60 | 
 61 | ```bash
 62 | conda create --name omniparse-venv python=3.10
 63 | conda activate omniparse-venv
 64 | ```
 65 | 
 66 | Install dependencies:
 67 | 
 68 | ```bash
 69 | poetry install
 70 | # or
 71 | pip install -e .
 72 | ```
 73 | 
 74 | #### 🛳️ Docker
 75 | 
 76 | To use OmniParse with Docker, execute the following commands:
 77 | 
 78 | 1. Pull the OmniParse API Docker image from Docker Hub 👉🏼 [Docker Image](https://hub.docker.com/r/savatar101/omniparse)
 79 | 2. Run the Docker container, exposing port 8000:
 80 | 
 81 | ```bash
 82 | docker pull savatar101/omniparse:0.1
 83 | # if you are running on a GPU
 84 | docker run --gpus all -p 8000:8000 savatar101/omniparse:0.1
 85 | # else
 86 | docker run -p 8000:8000 savatar101/omniparse:0.1
 87 | ```
 88 | 
 89 | Alternatively, build the Docker image locally and then run the container as follows:
 90 | 
 91 | ```bash
 92 | docker build -t omniparse .
 93 | # if you are running on a GPU
 94 | docker run --gpus all -p 8000:8000 omniparse
 95 | # else
 96 | docker run -p 8000:8000 omniparse
 97 | ```
 98 | 
 99 | 
100 | ### Usage
101 | 
102 | Run the server:
103 | 
104 | ```bash
105 | python server.py --host 0.0.0.0 --port 8000 --documents --media --web
106 | ```
107 | 
108 | * `--documents`: Load all the models that help you parse and ingest documents (the Surya OCR series of models and Florence-2).
109 | * `--media`: Load the Whisper model to transcribe audio and video files.
110 | * `--web`: Set up the Selenium crawler.
111 | 
112 | ### Running the Server
113 | 
114 | To start the API server, run the following command:
115 | 
116 | ```
117 | python server.py --host 0.0.0.0 --port 8000
118 | ```
119 | 
120 | Arguments:
121 | 
122 | * `--host`: Host IP address (default: 0.0.0.0)
123 | * `--port`: Port number (default: 8000)
124 | 
125 | ### Supported Data Types
126 | 
127 | | Type      | Supported Extensions                           |
128 | | --------- | ---------------------------------------------- |
129 | | Documents | .doc, .docx, .odt, .pdf, .ppt, .pptx           |
130 | | Images    | .png, .jpg, .jpeg, .tiff, .bmp, .heic          |
131 | | Video     | .mp4, .mkv, .avi, .mov                         |
132 | | Audio     | .mp3, .wav, .aac                               |
133 | | Web       | dynamic web pages, any http:// or https:// URL |
134 | 
135 | 
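All of the endpoints documented below are plain HTTP. As one hedged sketch (assuming a local server on port 8000 and the `requests` package; the file name is a placeholder), the image-task endpoint takes ordinary multipart form fields alongside the file — the accepted task names are listed under "Process Image" below:

```python
import requests

# Run a Florence-2 image task against a local file; see "Process Image"
# below for the accepted task names. File name and port are assumptions.
with open("photo.jpg", "rb") as f:  # hypothetical input image
    response = requests.post(
        "http://localhost:8000/parse_media/process_image",
        files={"image": ("photo.jpg", f)},
        data={"task": "Caption"},  # e.g. "OCR", "Object Detection", ...
        timeout=300,
    )
response.raise_for_status()
print(response.json())
```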
136 | 137 | API Endpoints 138 | 139 | Client library compatible with Langchain, llamaindex, and haystack integrations coming soon. 140 | 141 | * API Endpoints 142 | * Document Parsing 143 | * Parse Any Document 144 | * Parse PDF 145 | * Parse PowerPoint 146 | * Parse Word Document 147 | * Media Parsing 148 | * Parse Any Media 149 | * Parse Image 150 | * Process Image 151 | * Parse Video 152 | * Parse Audio 153 | * Website Parsing 154 | * Parse Website 155 | 156 | #### Document Parsing 157 | 158 | **Parse Any Document** 159 | 160 | Endpoint: `/parse_document` Method: POST 161 | 162 | Parses PDF, PowerPoint, or Word documents. 163 | 164 | Curl command: 165 | 166 | ``` 167 | curl -X POST -F "file=@/path/to/document" http://localhost:8000/parse_document 168 | ``` 169 | 170 | **Parse PDF** 171 | 172 | Endpoint: `/parse_document/pdf` Method: POST 173 | 174 | Parses PDF documents. 175 | 176 | Curl command: 177 | 178 | ``` 179 | curl -X POST -F "file=@/path/to/document.pdf" http://localhost:8000/parse_document/pdf 180 | ``` 181 | 182 | **Parse PowerPoint** 183 | 184 | Endpoint: `/parse_document/ppt` Method: POST 185 | 186 | Parses PowerPoint presentations. 187 | 188 | Curl command: 189 | 190 | ``` 191 | curl -X POST -F "file=@/path/to/presentation.ppt" http://localhost:8000/parse_document/ppt 192 | ``` 193 | 194 | **Parse Word Document** 195 | 196 | Endpoint: `/parse_document/docs` Method: POST 197 | 198 | Parses Word documents. 199 | 200 | Curl command: 201 | 202 | ``` 203 | curl -X POST -F "file=@/path/to/document.docx" http://localhost:8000/parse_document/docs 204 | ``` 205 | 206 | #### Media Parsing 207 | 208 | **Parse Image** 209 | 210 | Endpoint: `/parse_media/image` Method: POST 211 | 212 | Parses image files (PNG, JPEG, JPG, TIFF, WEBP). 213 | 214 | Curl command: 215 | 216 | ``` 217 | curl -X POST -F "file=@/path/to/image.jpg" http://localhost:8000/parse_media/image 218 | ``` 219 | 220 | **Process Image** 221 | 222 | Endpoint: `/parse_media/process_image` Method: POST 223 | 224 | Processes an image with a specific task. 225 | 226 | Possible task inputs: `OCR | OCR with Region | Caption | Detailed Caption | More Detailed Caption | Object Detection | Dense Region Caption | Region Proposal` 227 | 228 | Curl command: 229 | 230 | ``` 231 | curl -X POST -F "image=@/path/to/image.jpg" -F "task=Caption" -F "prompt=Optional prompt" http://localhost:8000/parse_media/process_image 232 | ``` 233 | 234 | Arguments: 235 | 236 | * `image`: The image file 237 | * `task`: The processing task (e.g., Caption, Object Detection) 238 | * `prompt`: Optional prompt for certain tasks 239 | 240 | **Parse Video** 241 | 242 | Endpoint: `/parse_media/video` Method: POST 243 | 244 | Parses video files (MP4, AVI, MOV, MKV). 245 | 246 | Curl command: 247 | 248 | ``` 249 | curl -X POST -F "file=@/path/to/video.mp4" http://localhost:8000/parse_media/video 250 | ``` 251 | 252 | **Parse Audio** 253 | 254 | Endpoint: `/parse_media/audio` Method: POST 255 | 256 | Parses audio files (MP3, WAV, FLAC). 257 | 258 | Curl command: 259 | 260 | ``` 261 | curl -X POST -F "file=@/path/to/audio.mp3" http://localhost:8000/parse_media/audio 262 | ``` 263 | 264 | #### Website Parsing 265 | 266 | **Parse Website** 267 | 268 | Endpoint: `/parse_website` Method: POST 269 | 270 | Parses a website given its URL. 
271 | 272 | Curl command: 273 | 274 | ``` 275 | curl -X POST -H "Content-Type: application/json" -d '{"url": "https://example.com"}' http://localhost:8000/parse_website 276 | ``` 277 | 278 | Arguments: 279 | 280 | * `url`: The URL of the website to parse 281 | 282 |
283 | 
284 | ### Coming Soon / Roadmap
285 | 
286 | 🦙 LlamaIndex | Langchain | Haystack integrations coming soon \
287 | 📚 Batch processing for handling multiple files at once \
288 | ⭐ Dynamic chunking and structured data extraction based on a specified schema \
289 | 🛠️ One magic API: just feed in your file, prompt what you want, and we will take care of the rest \
290 | 🔧 Dynamic model selection and support for external APIs \
291 | 📦 New open-source model to replace Surya OCR and Marker
292 | 
293 | **Final goal**: replace all the different models currently in use with a single multimodal model that can parse any type of data and get you the data you need.
294 | 
295 | ### License
296 | 
297 | OmniParse is licensed under the GPL-3.0 license. See `LICENSE` for more information.
298 | 
299 | ### Acknowledgements
300 | 
301 | This project builds upon the remarkable [Marker](https://github.com/VikParuchuri/marker) project created by [Vik Paruchuri](https://twitter.com/VikParuchuri). We express our gratitude for the inspiration and foundation provided by this project. Special thanks to [Surya-OCR](https://github.com/VikParuchuri/surya) and [Texify](https://github.com/VikParuchuri/texify) for the OCR models extensively used in this project, and to [Crawl4AI](https://github.com/unclecode/crawl4ai) for their contributions.
302 | 
303 | Models being used:
304 | 
305 | * Surya OCR, Detect, Layout, Order, and Texify
306 | * Florence-2 base
307 | * Whisper Small
308 | 
309 | Thank you to the authors for their contributions to these models.
310 | 
311 | ***
312 | 
313 | 
--------------------------------------------------------------------------------
/docs/SUMMARY.md:
--------------------------------------------------------------------------------
 1 | # Table of contents
 2 | 
 3 | * [OmniParse](README.md)
 4 | * [Installation](installation.md)
 5 | * [Deployment](deployment.md)
 6 | * [API](api.md)
 7 | * [Integration](integration.md)
 8 | 
 9 | ## Examples
10 | 
11 | * [OmniParse x Langchain](examples/omniprase-x-langchain.md)
12 | * [OmniParse x LlamaIndex](examples/omniprase-x-llamaindex.md)
13 | * [Vision RAG using OmniParse](examples/vision-rag-using-omniparse.md)
14 | 
--------------------------------------------------------------------------------
/docs/api.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | description: >-
  3 |   Detailed description of the API endpoints, including their functionality and
  4 |   how to use them via curl requests.
  5 | ---
  6 | 
  7 | # API
  8 | 
  9 | ## Documents
 10 | 
 11 | **Parse Any Document**
 12 | 
 13 | Endpoint: `/parse_document` Method: POST
 14 | 
 15 | Parses PDF, PowerPoint, or Word documents.
 16 | 
 17 | Curl command:
 18 | 
 19 | ```
 20 | curl -X POST -F "file=@/path/to/document" http://localhost:8000/parse_document
 21 | ```
 22 | 
 23 | **Parse PDF**
 24 | 
 25 | Endpoint: `/parse_document/pdf` Method: POST
 26 | 
 27 | Parses PDF documents.
 28 | 
 29 | Curl command:
 30 | 
 31 | ```
 32 | curl -X POST -F "file=@/path/to/document.pdf" http://localhost:8000/parse_document/pdf
 33 | ```
 34 | 
 35 | **Parse PowerPoint**
 36 | 
 37 | Endpoint: `/parse_document/ppt` Method: POST
 38 | 
 39 | Parses PowerPoint presentations.
 40 | 
 41 | Curl command:
 42 | 
 43 | ```
 44 | curl -X POST -F "file=@/path/to/presentation.ppt" http://localhost:8000/parse_document/ppt
 45 | ```
 46 | 
 47 | **Parse Word Document**
 48 | 
 49 | Endpoint: `/parse_document/docs` Method: POST
 50 | 
 51 | Parses Word documents.
 52 | 
 53 | Curl command:
 54 | 
 55 | ```
 56 | curl -X POST -F "file=@/path/to/document.docx" http://localhost:8000/parse_document/docs
 57 | ```
 58 | 
 59 | ## Image
 60 | 
 61 | **Parse Image**
 62 | 
 63 | Endpoint: `/parse_image/image` Method: POST
 64 | 
 65 | Parses image files (PNG, JPEG, JPG, TIFF, WEBP).
 66 | 
 67 | Curl command:
 68 | 
 69 | ```
 70 | curl -X POST -F "file=@/path/to/image.jpg" http://localhost:8000/parse_image/image
 71 | ```
 72 | 
 73 | **Process Image**
 74 | 
 75 | Endpoint: `/parse_image/process_image` Method: POST
 76 | 
 77 | Processes an image with a specific task.
 78 | 
 79 | Possible task inputs: `OCR | OCR with Region | Caption | Detailed Caption | More Detailed Caption | Object Detection | Dense Region Caption | Region Proposal`
 80 | 
 81 | Curl command:
 82 | 
 83 | ```
 84 | curl -X POST -F "image=@/path/to/image.jpg" -F "task=Caption" http://localhost:8000/parse_image/process_image
 85 | ```
 86 | 
 87 | Arguments:
 88 | 
 89 | * `image`: The image file
 90 | * `task`: The processing task (e.g., Caption, Object Detection)
 91 | * `prompt`: Optional prompt for certain tasks
 92 | 
 93 | ## Media
 94 | 
 95 | **Parse Video**
 96 | 
 97 | Endpoint: `/parse_media/video` Method: POST
 98 | 
 99 | Parses video files (MP4, AVI, MOV, MKV).
100 | 
101 | Curl command:
102 | 
103 | ```
104 | curl -X POST -F "file=@/path/to/video.mp4" http://localhost:8000/parse_media/video
105 | ```
106 | 
107 | **Parse Audio**
108 | 
109 | Endpoint: `/parse_media/audio` Method: POST
110 | 
111 | Parses audio files (MP3, WAV, FLAC).
112 | 113 | Curl command: 114 | 115 | ``` 116 | curl -X POST -F "file=@/path/to/audio.mp3" http://localhost:8000/parse_media/audio 117 | ``` 118 | 119 | ## Website 120 | 121 | **Parse Website** 122 | 123 | Endpoint: `/parse_website` Method: POST 124 | 125 | Parses a website given its URL. 126 | 127 | Curl command: 128 | 129 | ``` 130 | curl -X POST -H "Content-Type: application/json" -d '{"url": "https://example.com"}' http://localhost:8000/parse_website 131 | ``` 132 | 133 | Arguments: 134 | 135 | * `url`: The URL of the website to parse 136 | -------------------------------------------------------------------------------- /docs/assets/hero_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adithya-s-k/omniparse/9d1ae83c46de777427e67b48f82eaca45ad7994a/docs/assets/hero_image.png -------------------------------------------------------------------------------- /docs/assets/hero_image_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adithya-s-k/omniparse/9d1ae83c46de777427e67b48f82eaca45ad7994a/docs/assets/hero_image_2.png -------------------------------------------------------------------------------- /docs/assets/omniparse_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adithya-s-k/omniparse/9d1ae83c46de777427e67b48f82eaca45ad7994a/docs/assets/omniparse_logo.png -------------------------------------------------------------------------------- /docs/deployment.md: -------------------------------------------------------------------------------- 1 | --- 2 | description: Different Ways to Deploy the omniparse API endpoint 3 | --- 4 | 5 | # Deployment 6 | 7 | ## 🛳️ Docker 8 | 9 | To use OmniParse with Docker, execute the following commands: 10 | 11 | 1. Pull the OmniParse API Docker image from Docker Hub: 12 | 2. Run the Docker container, exposing port 8000: 👉🏼[Docker Image](https://hub.docker.com/r/savatar101/omniparse) 13 | 14 | ```bash 15 | docker pull savatar101/omniparse:0.1 16 | # if you are running on a gpu 17 | docker run --gpus all -p 8000:8000 savatar101/omniparse:0.1 18 | # else 19 | docker run -p 8000:8000 savatar101/omniparse:0.1 20 | ``` 21 | 22 | Alternatively, if you prefer to build the Docker image locally: Then, run the Docker container as follows: 23 | 24 | ```bash 25 | docker build -t omniparse . 26 | # if you are running on a gpu 27 | docker run --gpus all -p 8000:8000 omniparse 28 | # else 29 | docker run -p 8000:8000 omniparse 30 | 31 | ``` 32 | 33 | ## ✈️ Skypilot(coming soon) 34 | 35 | SkyPilot is a framework for running LLMs, AI, and batch jobs on any cloud, offering maximum cost savings, highest GPU availability, and managed execution. 
To deploy OmniParse using SkyPilot on any cloud provider, execute the following commands:
36 | 
37 | ```bash
38 | pip install "skypilot-nightly[all]"
39 | 
40 | # set up SkyPilot with the cloud provider of your choice
41 | 
42 | sky launch skypilot.yaml
43 | ```
44 | 
--------------------------------------------------------------------------------
/docs/examples/omniprase-x-langchain.md:
--------------------------------------------------------------------------------
1 | # OmniParse x Langchain
2 | 
3 | 
--------------------------------------------------------------------------------
/docs/examples/omniprase-x-llamaindex.md:
--------------------------------------------------------------------------------
1 | # OmniParse x LlamaIndex
2 | 
3 | Example coming soon
4 | 
--------------------------------------------------------------------------------
/docs/examples/vision-rag-using-omniparse.md:
--------------------------------------------------------------------------------
1 | # Vision RAG using OmniParse
2 | 
3 | 
--------------------------------------------------------------------------------
/docs/installation.md:
--------------------------------------------------------------------------------
 1 | # Installation
 2 | 
 3 | ### Installation
 4 | 
 5 | To install OmniParse, first clone the repository:
 6 | 
 7 | ```bash
 8 | git clone https://github.com/adithya-s-k/omniparse
 9 | cd omniparse
10 | ```
11 | 
12 | Create a virtual environment:
13 | 
14 | ```bash
15 | conda create -n omniparse-venv python=3.10
16 | conda activate omniparse-venv
17 | ```
18 | 
19 | Install dependencies:
20 | 
21 | ```bash
22 | poetry install
23 | # or
24 | pip install -e .
25 | ```
26 | 
27 | ### Usage
28 | 
29 | Run the server:
30 | 
31 | ```bash
32 | python server.py --host 0.0.0.0 --port 8000 --documents --media --web
33 | ```
34 | 
35 | * `--documents`: Load all the models that help you parse and ingest documents (the Surya OCR series of models and Florence-2).
36 | * `--media`: Load the Whisper model to transcribe audio and video files.
37 | * `--web`: Set up the Selenium crawler.
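Once the server reports that it is ready, you can sanity-check it from another shell. A hedged sketch in Python (OmniParse serves a FastAPI app, so the interactive API docs should be available at `/docs`; the host and port below assume the defaults used above):

```python
import requests

# Assumes the server was started with the command above on the default port.
response = requests.get("http://localhost:8000/docs", timeout=10)
print(response.status_code)  # expect 200 once the app is up
```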
38 | -------------------------------------------------------------------------------- /docs/integration.md: -------------------------------------------------------------------------------- 1 | # Integration 2 | 3 | ## Llamaindex 4 | 5 | (coming soon) 6 | 7 | ## Langchain 8 | 9 | (coming soon) 10 | -------------------------------------------------------------------------------- /download.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script to download models 3 | """ 4 | 5 | import argparse 6 | from omniparse import load_omnimodel 7 | 8 | 9 | def download_models(): 10 | parser = argparse.ArgumentParser(description="Download models for omniparse") 11 | 12 | parser.add_argument("--documents", action="store_true", help="Load document models") 13 | parser.add_argument("--media", action="store_true", help="Load media models") 14 | parser.add_argument("--web", action="store_true", help="Load web models") 15 | args = parser.parse_args() 16 | 17 | load_omnimodel(args.documents, args.media, args.web) 18 | 19 | 20 | if __name__ == "__main__": 21 | download_models() 22 | -------------------------------------------------------------------------------- /examples/OmniParse_GoogleColab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## [OmniParse](https://github.com/adithya-s-k/omniparse)\n", 8 | "Seamlessly ingest any data and get structured, actionable output.\n", 9 | "\n", 10 | "![OmniParse](https://raw.githubusercontent.com/adithya-s-k/omniparse/main/docs/assets/hero_image.png)\n", 11 | "[![GitHub Stars](https://img.shields.io/github/stars/adithya-s-k/omniparse?style=social)](https://github.com/adithya-s-k/omniparse/stargazers)\n", 12 | "[![GitHub Forks](https://img.shields.io/github/forks/adithya-s-k/omniparse?style=social)](https://github.com/adithya-s-k/omniparse/network/members)\n", 13 | "[![GitHub Issues](https://img.shields.io/github/issues/adithya-s-k/omniparse)](https://github.com/adithya-s-k/omniparse/issues)\n", 14 | "[![GitHub Pull Requests](https://img.shields.io/github/issues-pr/adithya-s-k/omniparse)](https://github.com/adithya-s-k/omniparse/pulls)\n", 15 | "[![License](https://img.shields.io/github/license/adithya-s-k/omniparse)](https://github.com/adithya-s-k/omniparse/blob/main/LICENSE)\n", 16 | "\n", 17 | "\n", 18 | "\n", 19 | "## Features\n", 20 | "✅ Completely local, no external APIs \n", 21 | "✅ Supports 10+ file types \n", 22 | "✅ Convert documents, multimedia, and web pages to high-quality structured markdown \n", 23 | "✅ Table extraction, image extraction/captioning, audio/video transcription, web page crawling \n", 24 | "✅ Easily deployable using Docker and Skypilot \n", 25 | "✅ Colab friendly \n", 26 | "\n", 27 | "### Problem Statement:\n", 28 | "It's challenging to process data as it comes in different shapes and sizes. 
OmniParse aims to be an ingestion/parsing platform where you can ingest any type of data, such as documents, images, audio, video, and web content, and get the most structured and actionable output that is GenAI (LLM) friendly.\n",
29 |     "\n",
30 |     "## Coming Soon\n",
31 |     "⭐ Dynamic chunking and structured data extraction based on a specified schema \n",
32 |     "🛠️ One magic API: just feed in your file, prompt what you want, and we will take care of the rest \n",
33 |     "🔧 Dynamic model selection and support for external APIs \n",
34 |     "📄 Batch processing for handling multiple files at once \n",
35 |     "🦙 New open-source model to replace Surya OCR and Marker \n",
36 |     "\n",
37 |     "**Final goal** - replace all the different models currently being used with a single multimodal model to parse any type of data and get the data you need.\n",
38 |     "\n",
39 |     "📄 - [Documentation](https://docs.cognitivelab.in/) \\\n",
40 |     "Created by [Adithya](https://x.com/adithya_s_k).\n",
41 |     "\n",
42 |     "| Original PDF | OmniParse-API | PyPDF |\n",
43 |     "| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n",
44 |     "| [![Original PDF](https://github.com/adithya-s-k/marker-api/raw/master/data/images/original\\_pdf.png)](https://github.com/adithya-s-k/marker-api/blob/master/data/images/original\\_pdf.png) | [![OmniParse-API](https://github.com/adithya-s-k/marker-api/raw/master/data/images/marker\\_api.png)](https://github.com/adithya-s-k/marker-api/blob/master/data/images/marker\\_api.png) | [![PyPDF](https://github.com/adithya-s-k/marker-api/raw/master/data/images/pypdf.png)](https://github.com/adithya-s-k/marker-api/blob/master/data/images/pypdf.png) |"
45 |    ]
46 |   },
47 |   {
48 |    "cell_type": "code",
49 |    "execution_count": null,
50 |    "metadata": {},
51 |    "outputs": [],
52 |    "source": [
53 |     "## Clone the repository\n",
54 |     "\n",
55 |     "!git clone https://github.com/adithya-s-k/omniparse.git\n",
56 |     "%cd omniparse\n",
57 |     "%pwd"
58 |    ]
59 |   },
60 |   {
61 |    "cell_type": "code",
62 |    "execution_count": null,
63 |    "metadata": {
64 |     "colab": {
65 |      "base_uri": "https://localhost:8080/",
66 |      "height": 1000
67 |     },
68 |     "id": "Wjd0_Fy3f4Wa",
69 |     "outputId": "82315a2f-67f0-40c8-a300-9cfbe392b988"
70 |    },
71 |    "outputs": [],
72 |    "source": [
73 |     "## Install dependencies\n",
74 |     "## if you get a restart session warning you can ignore it\n",
75 |     "\n",
76 |     "%pip install -e ."
77 |    ]
78 |   },
79 |   {
80 |    "cell_type": "code",
81 |    "execution_count": null,
82 |    "metadata": {},
83 |    "outputs": [],
84 |    "source": [
85 |     "%pip install transformers==4.41.2"
86 |    ]
87 |   },
88 |   {
89 |    "cell_type": "code",
90 |    "execution_count": null,
91 |    "metadata": {
92 |     "colab": {
93 |      "base_uri": "https://localhost:8080/"
94 |     },
95 |     "id": "5uLY5mBBjBah",
96 |     "outputId": "b0d532cf-3734-4f0f-93db-392170773c5a"
97 |    },
98 |    "outputs": [],
99 |    "source": [
100 |     "# Update and install necessary packages\n",
101 |     "!apt-get update && apt-get install -y --no-install-recommends \\\n",
102 |     "    wget \\\n",
103 |     "    curl \\\n",
104 |     "    unzip \\\n",
105 |     "    git \\\n",
106 |     "    libgl1 \\\n",
107 |     "    libglib2.0-0 \\\n",
108 |     "    curl \\\n",
109 |     "    gnupg2 \\\n",
110 |     "    ca-certificates \\\n",
111 |     "    apt-transport-https \\\n",
112 |     "    software-properties-common \\\n",
113 |     "    libreoffice \\\n",
114 |     "    ffmpeg \\\n",
115 |     "    git-lfs \\\n",
116 |     "    xvfb \\\n",
117 |     "    && ln -s /usr/bin/python3 /usr/bin/python \\\n",
118 |     "    && curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash \\\n",
119 |     "    && wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - \\\n",
120 |     "    && echo \"deb http://dl.google.com/linux/chrome/deb/ stable main\" > /etc/apt/sources.list.d/google-chrome.list \\\n",
121 |     "    && apt-get update \\\n",
122 |     "    && apt-get install -y python3-packaging \\\n",
123 |     "    && apt-get install -y --no-install-recommends google-chrome-stable \\\n",
124 |     "    && rm -rf /var/lib/apt/lists/*\n",
125 |     "\n",
126 |     "# Download and install ChromeDriver\n",
127 |     "!CHROMEDRIVER_VERSION=$(curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE) && \\\n",
128 |     "    wget -q -N https://chromedriver.storage.googleapis.com/$CHROMEDRIVER_VERSION/chromedriver_linux64.zip -P /tmp && \\\n",
129 |     "    unzip -o /tmp/chromedriver_linux64.zip -d /tmp && \\\n",
130 |     "    mv /tmp/chromedriver /usr/local/bin/chromedriver && \\\n",
131 |     "    chmod +x /usr/local/bin/chromedriver && \\\n",
132 |     "    rm /tmp/chromedriver_linux64.zip\n",
133 |     "\n",
134 |     "# Set environment variables\n",
135 |     "import os\n",
136 |     "os.environ['CHROME_BIN'] = '/usr/bin/google-chrome'\n",
137 |     "os.environ['CHROMEDRIVER'] = '/usr/local/bin/chromedriver'\n",
138 |     "os.environ['DISPLAY'] = ':99'\n",
139 |     "os.environ['DBUS_SESSION_BUS_ADDRESS'] = '/dev/null'\n",
140 |     "os.environ['PYTHONUNBUFFERED'] = '1'\n",
141 |     "\n",
142 |     "print(\"✅ Set up complete\")"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "markdown",
147 |    "metadata": {},
148 |    "source": [
149 |     "### Using Cloudflare tunnels (Recommended)\n",
150 |     "After the server is set up and the Cloudflare tunnel is available, go to /docs to access all the API endpoints"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": null,
156 |    "metadata": {
157 |     "colab": {
158 |      "base_uri": "https://localhost:8080/"
159 |     },
160 |     "id": "U_l14xEFgKpV",
161 |     "outputId": "646ff2a4-02fc-49d7-9425-a7355c22d451"
162 |    },
163 |    "outputs": [],
164 |    "source": [
165 |     "!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb\n",
166 |     "!dpkg -i cloudflared-linux-amd64.deb\n",
167 |     "\n",
168 |     "import subprocess\n",
169 |     "import threading\n",
170 |     "import time\n",
171 |     "import socket\n",
172 |     "import urllib.request\n",
173 |     "\n",
174 |     "def iframe_thread(port):\n",
175 |     "    while True:\n",
176 |     "        time.sleep(0.5)\n",
177 |     "        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n",
178 |     "        result
= sock.connect_ex(('127.0.0.1', port))\n",
179 |     "        if result == 0:\n",
180 |     "            break\n",
181 |     "        sock.close()\n",
182 |     "    print(\"\\nOmniParse API finished loading, trying to launch cloudflared (if it gets stuck here cloudflared is having issues)\\n\")\n",
183 |     "\n",
184 |     "    p = subprocess.Popen([\"cloudflared\", \"tunnel\", \"--url\", \"http://127.0.0.1:{}\".format(port)], stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n",
185 |     "    for line in p.stderr:\n",
186 |     "        l = line.decode()\n",
187 |     "        if \"trycloudflare.com \" in l:\n",
188 |     "            print(\"This is the URL to access OmniParse:\", l[l.find(\"http\"):], end='')\n",
189 |     "        #print(l, end='')\n",
190 |     "\n",
191 |     "\n",
192 |     "threading.Thread(target=iframe_thread, daemon=True, args=(8000,)).start()\n",
193 |     "\n",
194 |     "!python server.py --host 127.0.0.1 --port 8000 --documents --media --web"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "markdown",
199 |    "metadata": {},
200 |    "source": [
201 |     "### Forward using localtunnel"
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "code",
206 |    "execution_count": null,
207 |    "metadata": {
208 |     "colab": {
209 |      "base_uri": "https://localhost:8080/"
210 |     },
211 |     "id": "kj_Xx-VBij08",
212 |     "outputId": "c0db6d9b-b90f-4a3d-9e7b-7aaaf2c71ed7"
213 |    },
214 |    "outputs": [],
215 |    "source": [
216 |     "!npm install -g localtunnel\n",
217 |     "\n",
218 |     "import subprocess\n",
219 |     "import threading\n",
220 |     "import time\n",
221 |     "import socket\n",
222 |     "import urllib.request\n",
223 |     "\n",
224 |     "def iframe_thread(port):\n",
225 |     "    while True:\n",
226 |     "        time.sleep(0.5)\n",
227 |     "        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n",
228 |     "        result = sock.connect_ex(('127.0.0.1', port))\n",
229 |     "        if result == 0:\n",
230 |     "            break\n",
231 |     "        sock.close()\n",
232 |     "    print(\"\\nOmniParse finished loading, trying to launch localtunnel (if it gets stuck here localtunnel is having issues)\\n\")\n",
233 |     "\n",
234 |     "    print(\"The password/endpoint IP for localtunnel is:\", urllib.request.urlopen('https://ipv4.icanhazip.com').read().decode('utf8').strip(\"\\n\"))\n",
235 |     "    p = subprocess.Popen([\"lt\", \"--port\", \"{}\".format(port)], stdout=subprocess.PIPE)\n",
236 |     "    for line in p.stdout:\n",
237 |     "        print(line.decode(), end='')\n",
238 |     "\n",
239 |     "\n",
240 |     "threading.Thread(target=iframe_thread, daemon=True, args=(8000,)).start()\n",
241 |     "\n",
242 |     "!python server.py --host 127.0.0.1 --port 8000 --documents --media --web"
243 |    ]
244 |   }
245 |  ],
246 |  "metadata": {
247 |   "accelerator": "GPU",
248 |   "colab": {
249 |    "gpuType": "T4",
250 |    "provenance": []
251 |   },
252 |   "kernelspec": {
253 |    "display_name": "Python 3",
254 |    "name": "python3"
255 |   },
256 |   "language_info": {
257 |    "codemirror_mode": {
258 |     "name": "ipython",
259 |     "version": 3
260 |    },
261 |    "file_extension": ".py",
262 |    "mimetype": "text/x-python",
263 |    "name": "python",
264 |    "nbconvert_exporter": "python",
265 |    "pygments_lexer": "ipython3",
266 |    "version": "3.10.14"
267 |   }
268 |  },
269 |  "nbformat": 4,
270 |  "nbformat_minor": 0
271 | }
272 | 
--------------------------------------------------------------------------------
/omniparse/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Title: OmniParse
3 | Author: Adithya S Kolavi
4 | Date: 2024-07-02
5 | 
6 | This code includes portions of code from the marker repository by VikParuchuri.
7 | Original repository: https://github.com/VikParuchuri/marker 8 | 9 | Original Author: VikParuchuri 10 | Original Date: 2024-01-15 11 | 12 | License: GNU General Public License (GPL) Version 3 13 | URL: https://github.com/VikParuchuri/marker/blob/master/LICENSE 14 | 15 | Description: 16 | This section of the code was adapted from the marker repository to load all the OCR, layout and reading order detection models. 17 | All credits for the original implementation go to VikParuchuri. 18 | """ 19 | 20 | import torch 21 | from typing import Any 22 | from pydantic import BaseModel 23 | from transformers import AutoProcessor, AutoModelForCausalLM 24 | import whisper 25 | from omniparse.utils import print_omniparse_text_art 26 | from omniparse.web.web_crawler import WebCrawler 27 | from marker.models import load_all_models 28 | # from omniparse.documents.models import load_all_models 29 | 30 | 31 | class SharedState(BaseModel): 32 | model_list: Any = None 33 | vision_model: Any = None 34 | vision_processor: Any = None 35 | whisper_model: Any = None 36 | crawler: Any = None 37 | 38 | 39 | shared_state = SharedState() 40 | 41 | 42 | def load_omnimodel(load_documents: bool, load_media: bool, load_web: bool): 43 | global shared_state 44 | print_omniparse_text_art() 45 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 46 | if load_documents: 47 | print("[LOG] ✅ Loading OCR Model") 48 | shared_state.model_list = load_all_models() 49 | print("[LOG] ✅ Loading Vision Model") 50 | # if device == "cuda": 51 | shared_state.vision_model = AutoModelForCausalLM.from_pretrained( 52 | "microsoft/Florence-2-base", trust_remote_code=True 53 | ).to(device) 54 | shared_state.vision_processor = AutoProcessor.from_pretrained( 55 | "microsoft/Florence-2-base", trust_remote_code=True 56 | ) 57 | 58 | if load_media: 59 | print("[LOG] ✅ Loading Audio Model") 60 | shared_state.whisper_model = whisper.load_model("small") 61 | 62 | if load_web: 63 | print("[LOG] ✅ Loading Web Crawler") 64 | shared_state.crawler = WebCrawler(verbose=True) 65 | 66 | 67 | def get_shared_state(): 68 | return shared_state 69 | 70 | 71 | def get_active_models(): 72 | print(shared_state) 73 | # active_models = [key for key, value in shared_state.dict().items() if value is not None] 74 | # print(f"These are the active model : {active_models}") 75 | return shared_state 76 | -------------------------------------------------------------------------------- /omniparse/chunking/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import re 3 | from collections import Counter 4 | import string 5 | from nltk.tokenize import sent_tokenize 6 | from omniparse.web.model_loader import load_nltk_punkt 7 | 8 | 9 | # Define the abstract base class for chunking strategies 10 | class ChunkingStrategy(ABC): 11 | @abstractmethod 12 | def chunk(self, text: str) -> list: 13 | """ 14 | Abstract method to chunk the given text. 
15 |         """
16 |         pass
17 | 
18 | 
19 | # Regex-based chunking
20 | class RegexChunking(ChunkingStrategy):
21 |     def __init__(self, patterns=None, **kwargs):
22 |         if patterns is None:
23 |             patterns = [r"\n\n"]  # Default split pattern
24 |         self.patterns = patterns
25 | 
26 |     def chunk(self, text: str) -> list:
27 |         paragraphs = [text]
28 |         for pattern in self.patterns:
29 |             new_paragraphs = []
30 |             for paragraph in paragraphs:
31 |                 new_paragraphs.extend(re.split(pattern, paragraph))
32 |             paragraphs = new_paragraphs
33 |         return paragraphs
34 | 
35 | 
36 | # NLP-based sentence chunking
37 | class NlpSentenceChunking(ChunkingStrategy):
38 |     def __init__(self, **kwargs):
39 |         load_nltk_punkt()
40 |         pass
41 | 
42 |     def chunk(self, text: str) -> list:
43 |         sentences = sent_tokenize(text)
44 |         sens = [sent.strip() for sent in sentences]
45 | 
46 |         return list(set(sens))
47 | 
48 | 
49 | # Topic-based segmentation using TextTiling
50 | class TopicSegmentationChunking(ChunkingStrategy):
51 |     def __init__(self, num_keywords=3, **kwargs):
52 |         import nltk as nl
53 | 
54 |         self.tokenizer = nl.tokenize.TextTilingTokenizer()
55 |         self.num_keywords = num_keywords
56 | 
57 |     def chunk(self, text: str) -> list:
58 |         # Use the TextTilingTokenizer to segment the text
59 |         segmented_topics = self.tokenizer.tokenize(text)
60 |         return segmented_topics
61 | 
62 |     def extract_keywords(self, text: str) -> list:
63 |         # Tokenize and remove stopwords and punctuation
64 |         import nltk as nl
65 | 
66 |         tokens = nl.tokenize.word_tokenize(text)
67 |         tokens = [
68 |             token.lower()
69 |             for token in tokens
70 |             if token not in nl.corpus.stopwords.words("english")
71 |             and token not in string.punctuation
72 |         ]
73 | 
74 |         # Calculate frequency distribution
75 |         freq_dist = Counter(tokens)
76 |         keywords = [word for word, freq in freq_dist.most_common(self.num_keywords)]
77 |         return keywords
78 | 
79 |     def chunk_with_topics(self, text: str) -> list:
80 |         # Segment the text into topics
81 |         segments = self.chunk(text)
82 |         # Extract keywords for each topic segment
83 |         segments_with_topics = [
84 |             (segment, self.extract_keywords(segment)) for segment in segments
85 |         ]
86 |         return segments_with_topics
87 | 
88 | 
89 | # Fixed-length word chunks
90 | class FixedLengthWordChunking(ChunkingStrategy):
91 |     def __init__(self, chunk_size=100, **kwargs):
92 |         self.chunk_size = chunk_size
93 | 
94 |     def chunk(self, text: str) -> list:
95 |         words = text.split()
96 |         return [
97 |             " ".join(words[i : i + self.chunk_size])
98 |             for i in range(0, len(words), self.chunk_size)
99 |         ]
100 | 
101 | 
102 | # Sliding window chunking
103 | class SlidingWindowChunking(ChunkingStrategy):
104 |     def __init__(self, window_size=100, step=50, **kwargs):
105 |         self.window_size = window_size
106 |         self.step = step
107 | 
108 |     def chunk(self, text: str) -> list:
109 |         words = text.split()
110 |         chunks = []
111 |         for i in range(0, len(words), self.step):
112 |             chunks.append(" ".join(words[i : i + self.window_size]))
113 |         return chunks
114 | 
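
For a quick sense of how the strategies above differ, here is a small usage sketch (a hypothetical snippet, assuming the package and its `nltk` dependency are installed; `NlpSentenceChunking` and `TopicSegmentationChunking` additionally need their NLTK data downloaded):

```python
from omniparse.chunking import (
    RegexChunking,
    FixedLengthWordChunking,
    SlidingWindowChunking,
)

text = "First paragraph about parsing.\n\nSecond paragraph with a few more words."

# Split on blank lines (the default pattern)
print(RegexChunking().chunk(text))

# Non-overlapping 5-word chunks
print(FixedLengthWordChunking(chunk_size=5).chunk(text))

# Overlapping windows: 5 words wide, advancing 3 words per step
print(SlidingWindowChunking(window_size=5, step=3).chunk(text))
```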
--------------------------------------------------------------------------------
/omniparse/documents/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Title: OmniParse
3 | Author: Adithya S Kolavi
4 | Date: 2024-07-02
5 | 
6 | This code includes portions of code from the marker repository by VikParuchuri.
7 | Original repository: https://github.com/VikParuchuri/marker
8 | 
9 | Original Author: VikParuchuri
10 | Original Date: 2024-01-15
11 | 
12 | License: GNU General Public License (GPL) Version 3
13 | URL: https://github.com/VikParuchuri/marker/blob/master/LICENSE
14 | 
15 | Description:
16 | This section of the code was adapted from the marker repository to enhance pdf/word/ppt text parsing.
17 | All credits for the original implementation go to VikParuchuri.
18 | """
19 | 
20 | import os
21 | import tempfile
22 | import subprocess
23 | 
24 | # from omniparse.documents.parse import parse_single_pdf
25 | from marker.convert import convert_single_pdf
26 | from omniparse.utils import encode_images
27 | from omniparse.models import responseDocument
28 | 
29 | 
30 | # Function to handle PDF parsing
31 | def parse_pdf(input_data, model_state) -> responseDocument:
32 |     try:
33 |         if isinstance(input_data, bytes):
34 |             with tempfile.NamedTemporaryFile(
35 |                 delete=False, suffix=".pdf"
36 |             ) as temp_pdf_file:
37 |                 temp_pdf_file.write(input_data)
38 |                 temp_pdf_path = temp_pdf_file.name
39 | 
40 |             input_path = temp_pdf_path
41 |             cleanup_tempfile = True
42 | 
43 |         elif isinstance(input_data, str) and input_data.endswith(".pdf"):
44 |             input_path = input_data
45 |             cleanup_tempfile = False
46 | 
47 |         else:
48 |             raise ValueError(
49 |                 "Invalid input data format. Expected bytes or PDF file path."
50 |             )
51 | 
52 |         full_text, images, out_meta = convert_single_pdf(
53 |             input_path, model_state.model_list
54 |         )
55 | 
56 |         parse_pdf_result = responseDocument(text=full_text, metadata=out_meta)
57 |         encode_images(images, parse_pdf_result)
58 | 
59 |         if cleanup_tempfile:
60 |             os.remove(input_path)
61 | 
62 |         return parse_pdf_result
63 | 
64 |     except Exception as e:
65 |         raise RuntimeError(f"Error parsing PDF: {str(e)}")
66 | 
67 | 
68 | # Function to handle PPT and DOC parsing
69 | def parse_ppt(input_data, model_state) -> responseDocument:
70 |     try:
71 |         if isinstance(input_data, bytes):
72 |             print("Received ppt file")
73 |             with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
74 |                 tmp_file.write(input_data)
75 |                 tmp_file.flush()
76 |                 input_path = tmp_file.name
77 | 
78 |         elif isinstance(input_data, str) and (
79 |             input_data.endswith(".ppt")
80 |             or input_data.endswith(".pptx")
81 |             or input_data.endswith(".doc")
82 |             or input_data.endswith(".docx")
83 |         ):
84 |             input_path = input_data
85 | 
86 |         else:
87 |             raise ValueError(
88 |                 "Invalid input data format. Expected bytes or PPT/DOC file path."
89 |             )
90 | 
91 |         if input_path.endswith((".ppt", ".pptx", ".doc", ".docx")):
92 |             output_dir = tempfile.mkdtemp()
93 |             command = [
94 |                 "libreoffice",
95 |                 "--headless",
96 |                 "--convert-to",
97 |                 "pdf",
98 |                 "--outdir",
99 |                 output_dir,
100 |                 input_path,
101 |             ]
102 |             subprocess.run(command, check=True)
103 |             output_pdf_path = os.path.join(
104 |                 output_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf"
105 |             )
106 |             input_path = output_pdf_path
107 | 
108 |         full_text, images, out_meta = convert_single_pdf(
109 |             input_path, model_state.model_list
110 |         )
111 |         # encode_images below attaches the extracted images to the response document
112 | 
113 |         parse_ppt_result = responseDocument(text=full_text, metadata=out_meta)
114 |         encode_images(images, parse_ppt_result)
115 | 
116 |         if input_data != input_path:
117 |             os.remove(input_path)
118 | 
119 |         return parse_ppt_result
120 | 
121 |     except Exception as e:
122 |         raise RuntimeError(f"Error parsing PPT: {str(e)}")
123 | 
124 | 
125 | def parse_doc(input_data, model_state) -> responseDocument:
126 |     try:
127 |         if isinstance(input_data, bytes):
128 |             with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
129 |                 tmp_file.write(input_data)
130 |                 tmp_file.flush()
131 |                 input_path = tmp_file.name
132 | 
133 |         elif isinstance(input_data, str) and (
134 |             input_data.endswith(".ppt")
135 |             or input_data.endswith(".pptx")
136 |             or input_data.endswith(".doc")
137 |             or input_data.endswith(".docx")
138 |         ):
139 |             input_path = input_data
140 | 
141 |         else:
142 |             raise ValueError(
143 |                 "Invalid input data format. Expected bytes or PPT/DOC file path."
144 |             )
145 | 
146 |         if input_path.endswith((".ppt", ".pptx", ".doc", ".docx")):
147 |             output_dir = tempfile.mkdtemp()
148 |             command = [
149 |                 "libreoffice",
150 |                 "--headless",
151 |                 "--convert-to",
152 |                 "pdf",
153 |                 "--outdir",
154 |                 output_dir,
155 |                 input_path,
156 |             ]
157 |             subprocess.run(command, check=True)
158 |             output_pdf_path = os.path.join(
159 |                 output_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf"
160 |             )
161 |             input_path = output_pdf_path
162 | 
163 |         full_text, images, out_meta = convert_single_pdf(
164 |             input_path, model_state.model_list
165 |         )
166 |         # encode_images below attaches the extracted images to the response document
167 | 
168 |         parse_doc_result = responseDocument(text=full_text, metadata=out_meta)
169 |         encode_images(images, parse_doc_result)
170 | 
171 |         if input_data != input_path:
172 |             os.remove(input_path)
173 | 
174 |         return parse_doc_result
175 | 
176 |     except Exception as e:
177 |         raise RuntimeError(f"Error parsing DOC: {str(e)}")
178 | 
--------------------------------------------------------------------------------
/omniparse/documents/router.py:
--------------------------------------------------------------------------------
1 | """
2 | Title: OmniParse
3 | Author: Adithya S Kolavi
4 | Date: 2024-07-02
5 | 
6 | This code includes portions of code from the marker repository by VikParuchuri.
7 | Original repository: https://github.com/VikParuchuri/marker
8 | 
9 | Original Author: VikParuchuri
10 | Original Date: 2024-01-15
11 | 
12 | License: GNU General Public License (GPL) Version 3
13 | URL: https://github.com/VikParuchuri/marker/blob/master/LICENSE
14 | 
15 | Description:
16 | This section of the code was adapted from the marker repository to enhance pdf/word/ppt text parsing.
17 | All credits for the original implementation go to VikParuchuri.
18 | """
19 | 
20 | import os
21 | import tempfile
22 | import subprocess
23 | 
24 | # from omniparse.documents.parse import parse_single_pdf
25 | from fastapi import APIRouter, File, UploadFile, HTTPException
26 | from fastapi.responses import JSONResponse
27 | from omniparse import get_shared_state
28 | 
29 | # from omniparse.documents import parse_pdf , parse_ppt , parse_doc
30 | # from omniparse.documents import parse_pdf
31 | from marker.convert import convert_single_pdf
32 | from omniparse.utils import encode_images
33 | from omniparse.models import responseDocument
34 | 
35 | document_router = APIRouter()
36 | model_state = get_shared_state()
37 | 
38 | 
39 | # Document parsing endpoints
40 | @document_router.post("/pdf")
41 | async def parse_pdf_endpoint(file: UploadFile = File(...)):
42 |     try:
43 |         file_bytes = await file.read()
44 |         full_text, images, out_meta = convert_single_pdf(
45 |             file_bytes, model_state.model_list
46 |         )
47 | 
48 |         result = responseDocument(text=full_text, metadata=out_meta)
49 |         encode_images(images, result)
50 |         # result : responseDocument = convert_single_pdf(file_bytes , model_state.model_list)
51 | 
52 |         return JSONResponse(content=result.model_dump())
53 | 
54 |     except Exception as e:
55 |         raise HTTPException(status_code=500, detail=str(e))
56 | 
57 | 
58 | # Document parsing endpoints
59 | @document_router.post("/ppt")
60 | async def parse_ppt_endpoint(file: UploadFile = File(...)):
61 |     with tempfile.NamedTemporaryFile(delete=False, suffix=".ppt") as tmp_ppt:
62 |         tmp_ppt.write(await file.read())
63 |         tmp_ppt.flush()
64 |         input_path = tmp_ppt.name
65 | 
66 |     output_dir = tempfile.mkdtemp()
67 |     command = [
68 |         "libreoffice",
69 |         "--headless",
70 |         "--convert-to",
71 |         "pdf",
72 |         "--outdir",
73 |         output_dir,
74 |         input_path,
75 |     ]
76 |     subprocess.run(command, check=True)
77 | 
78 |     output_pdf_path = os.path.join(
79 |         output_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf"
80 |     )
81 | 
82 |     with open(output_pdf_path, "rb") as pdf_file:
83 |         pdf_bytes = pdf_file.read()
84 | 
85 |     full_text, images, out_meta = convert_single_pdf(pdf_bytes, model_state.model_list)
86 | 
87 |     os.remove(input_path)
88 |     os.remove(output_pdf_path)
89 |     os.rmdir(output_dir)
90 | 
91 |     result = responseDocument(text=full_text, metadata=out_meta)
92 |     encode_images(images, result)
93 | 
94 |     return JSONResponse(content=result.model_dump())
95 | 
96 | 
97 | @document_router.post("/docs")
98 | async def parse_doc_endpoint(file: UploadFile = File(...)):
99 |     # Keep the uploaded extension so LibreOffice picks the right import filter
100 |     suffix = os.path.splitext(file.filename)[1] or ".docx"
101 |     with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_doc:
102 |         tmp_doc.write(await file.read())
103 |         tmp_doc.flush()
104 |         input_path = tmp_doc.name
105 | 
106 |     output_dir = tempfile.mkdtemp()
107 |     command = [
108 |         "libreoffice",
109 |         "--headless",
110 |         "--convert-to",
111 |         "pdf",
112 |         "--outdir",
113 |         output_dir,
114 |         input_path,
115 |     ]
116 |     subprocess.run(command, check=True)
117 | 
118 |     output_pdf_path = os.path.join(
119 |         output_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf"
120 |     )
121 | 
122 |     with open(output_pdf_path, "rb") as pdf_file:
123 |         pdf_bytes = pdf_file.read()
124 | 
125 |     full_text, images, out_meta = convert_single_pdf(pdf_bytes, model_state.model_list)
126 | 
127 |     os.remove(input_path)
128 |     os.remove(output_pdf_path)
129 |     os.rmdir(output_dir)
130 | 
131 |     result = responseDocument(text=full_text, metadata=out_meta)
132 |     encode_images(images, result)
133 | 
134 |     return JSONResponse(content=result.model_dump())
135 | 
136 | 
137 | @document_router.post("")
138 | async def parse_any_endpoint(file: UploadFile = File(...)):
139 |     allowed_extensions =
{".pdf", ".ppt", ".pptx", ".doc", ".docx"}
140 |     file_ext = os.path.splitext(file.filename)[1]
141 | 
142 |     if file_ext.lower() not in allowed_extensions:
143 |         return JSONResponse(
144 |             content={
145 |                 "message": "Unsupported file type. Only PDF, PPT, and DOCX are allowed."
146 |             },
147 |             status_code=400,
148 |         )
149 | 
150 |     with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
151 |         tmp_file.write(await file.read())
152 |         tmp_file.flush()
153 |         input_path = tmp_file.name
154 | 
155 |     if file_ext.lower() in {".ppt", ".pptx", ".doc", ".docx"}:
156 |         output_dir = tempfile.mkdtemp()
157 |         command = [
158 |             "libreoffice",
159 |             "--headless",
160 |             "--convert-to",
161 |             "pdf",
162 |             "--outdir",
163 |             output_dir,
164 |             input_path,
165 |         ]
166 |         subprocess.run(command, check=True)
167 |         output_pdf_path = os.path.join(
168 |             output_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf"
169 |         )
170 |         input_path = output_pdf_path
171 | 
172 |     # Common parsing logic
173 |     full_text, images, out_meta = convert_single_pdf(input_path, model_state.model_list)
174 | 
175 |     os.remove(input_path)
176 | 
177 |     result = responseDocument(text=full_text, metadata=out_meta)
178 |     encode_images(images, result)
179 | 
180 |     return JSONResponse(content=result.model_dump())
181 | 
182 | 
183 | # @document_router.post("/docs")
184 | # async def parse_docs_endpoint(file: UploadFile = File(...)):
185 | #     try:
186 | 
187 | #         file_bytes = await file.read()
188 | #         result = parse_doc(file_bytes , model_state)
189 | 
190 | #         return JSONResponse(content=result)
191 | 
192 | #     except Exception as e:
193 | #         raise HTTPException(status_code=500, detail=str(e))
194 | 
195 | # @document_router.post("/ppt")
196 | # async def parse_ppt_endpoint(file: UploadFile = File(...)):
197 | #     try:
198 | #         file_bytes = await file.read()
199 | #         result = parse_ppt(file_bytes , model_state)
200 | 
201 | #         return JSONResponse(content=result)
202 | 
203 | #     except Exception as e:
204 | #         raise HTTPException(status_code=500, detail=str(e))
205 | 
--------------------------------------------------------------------------------
/omniparse/extraction/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adithya-s-k/omniparse/9d1ae83c46de777427e67b48f82eaca45ad7994a/omniparse/extraction/__init__.py
--------------------------------------------------------------------------------
/omniparse/image/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Title: OmniParse
3 | Author: Adithya S Kolavi
4 | Date: 2024-07-02
5 | 
6 | This code includes portions of code from the marker repository by VikParuchuri.
7 | Original repository: https://github.com/VikParuchuri/marker
8 | 
9 | Original Author: VikParuchuri
10 | Original Date: 2024-01-15
11 | 
12 | License: GNU General Public License (GPL) Version 3
13 | URL: https://github.com/VikParuchuri/marker/blob/master/LICENSE
14 | 
15 | Description:
16 | This section of the code was adapted from the marker repository to enhance image text parsing.
17 | All credits for the original implementation go to VikParuchuri.
18 | """
19 | 
20 | """
21 | Title: OmniParse
22 | Author: Adithya S Kolavi
23 | Date: 2024-07-02
24 | 
25 | This code includes portions of code from the Florence-2 repository by gokaygokay.
26 | Original repository: https://huggingface.co/spaces/gokaygokay/Florence-2
27 | 
28 | Original Author: gokaygokay
29 | Original Date: 2024-06-30
30 | 
31 | URL: https://huggingface.co/spaces/gokaygokay/Florence-2
32 | """
33 | 
34 | 
35 | # Image parsing and processing helpers
36 | import io
37 | import os
38 | import tempfile
39 | import img2pdf
40 | from PIL import Image
41 | 
42 | # from omniparse.document.parse import parse_single_image
43 | from marker.convert import convert_single_pdf
44 | from omniparse.image.process import process_image_task
45 | from omniparse.utils import encode_images
46 | from omniparse.models import responseDocument
47 | 
48 | 
49 | def parse_image(input_data, model_state) -> dict:
50 |     temp_files = []
51 | 
52 |     try:
53 |         if isinstance(input_data, bytes):
54 |             image = Image.open(io.BytesIO(input_data))
55 |         elif isinstance(input_data, str) and os.path.isfile(input_data):
56 |             image = Image.open(input_data)
57 |         else:
58 |             raise ValueError(
59 |                 "Invalid input data format. Expected image bytes or image file path."
60 |             )
61 | 
62 |         accepted_formats = {"PNG", "JPEG", "JPG", "TIFF", "WEBP"}
63 |         if image.format not in accepted_formats:
64 |             raise ValueError(
65 |                 f"Unsupported image format '{image.format}'. Accepted formats are: {', '.join(accepted_formats)}"
66 |             )
67 | 
68 |         # Convert RGBA to RGB if necessary
69 |         if image.mode == "RGBA":
70 |             image = image.convert("RGB")
71 | 
72 |         # Create a temporary file for the image
73 |         with tempfile.NamedTemporaryFile(
74 |             delete=False, suffix=".jpg"
75 |         ) as temp_image_file:
76 |             image.save(temp_image_file.name)
77 |             temp_files.append(temp_image_file.name)
78 | 
79 |         # Convert image to PDF
80 |         with tempfile.NamedTemporaryFile(
81 |             delete=False, suffix=".pdf"
82 |         ) as temp_pdf_file:
83 |             pdf_bytes = img2pdf.convert(temp_image_file.name)
84 | 
85 |             # Write PDF bytes to the temporary file
86 |             temp_pdf_file.write(pdf_bytes)
87 |             temp_pdf_path = temp_pdf_file.name
88 |             temp_files.append(temp_pdf_path)
89 | 
90 |         # Parse the PDF file
91 |         full_text, images, out_meta = convert_single_pdf(
92 |             temp_pdf_path, model_state.model_list
93 |         )
94 | 
95 |         parse_image_result = responseDocument(text=full_text, metadata=out_meta)
96 |         encode_images(images, parse_image_result)
97 | 
98 |         return parse_image_result
99 | 
100 |     finally:
101 |         # Clean up the temporary files
102 |         for file_path in temp_files:
103 |             if os.path.exists(file_path):
104 |                 os.remove(file_path)
105 | 
106 | 
107 | def process_image(input_data, task, model_state) -> responseDocument:
108 |     try:
109 |         temp_files = []
110 | 
111 |         if isinstance(input_data, bytes):
112 |             with tempfile.NamedTemporaryFile(delete=False) as temp_file:
113 |                 temp_file.write(input_data)
114 |                 temp_file.flush()
115 |                 temp_file_path = temp_file.name
116 |                 temp_files.append(temp_file_path)
117 | 
118 |         elif isinstance(input_data, str) and os.path.isfile(input_data):
119 |             temp_file_path = input_data
120 |             temp_files.append(temp_file_path)
121 | 
122 |         else:
123 |             raise ValueError(
124 |                 "Invalid input data format. Expected image bytes or image file path."
125 |             )
126 | 
127 |         # Open the saved image using PIL
128 |         image_data = Image.open(temp_file_path).convert("RGB")
129 | 
130 |         # Run the requested Florence-2 task on the image
131 |         image_process_results: responseDocument = process_image_task(
132 |             image_data, task, model_state
133 |         )
134 | 
135 |         return image_process_results
136 | 
137 |     finally:
138 |         # Clean up the temporary files
139 |         for file_path in temp_files:
140 |             if os.path.exists(file_path):
141 |                 os.remove(file_path)
142 | 
--------------------------------------------------------------------------------
/omniparse/image/process.py:
--------------------------------------------------------------------------------
1 | """
2 | Title: OmniParse
3 | Author: Adithya S Kolavi
4 | Date: 2024-07-02
5 | 
6 | This code includes portions of code from the Florence-2 repository by gokaygokay.
7 | Original repository: https://huggingface.co/spaces/gokaygokay/Florence-2
8 | 
9 | Original Author: gokaygokay
10 | Original Date: 2024-06-30
11 | 
12 | URL: https://huggingface.co/spaces/gokaygokay/Florence-2
13 | """
14 | 
15 | from typing import Dict, Any, Union
16 | from PIL import Image as PILImage
17 | import base64
18 | from io import BytesIO
19 | import copy
20 | from omniparse.image.utils import plot_bbox, fig_to_pil, draw_polygons, draw_ocr_bboxes
21 | from omniparse.models import responseDocument
22 | 
23 | 
24 | def process_image_task(
25 |     image_data: Union[str, bytes, PILImage.Image], task_prompt: str, model_state
26 | ) -> Dict[str, Any]:
27 |     # Convert image_data if it's in bytes
28 |     if isinstance(image_data, bytes):
29 |         pil_image = PILImage.open(BytesIO(image_data))
30 |     elif isinstance(image_data, str):
31 |         try:
32 |             image_bytes = base64.b64decode(image_data)
33 |             pil_image = PILImage.open(BytesIO(image_bytes))
34 |         except Exception as e:
35 |             raise ValueError(f"Failed to decode base64 image: {str(e)}")
36 |     elif isinstance(image_data, PILImage.Image):
37 |         pil_image = image_data
38 |     else:
39 |         raise ValueError(
40 |             "Unsupported image_data type. Should be either a base64-encoded string, bytes (binary image data), or a PIL.Image instance."
41 |         )
42 | 
43 |     # Map the task name to its Florence-2 task token (the "+ Grounding" variants reuse the phrase-grounding token)
44 |     if task_prompt == "Caption":
45 |         task_prompt_model = "<CAPTION>"
46 |     elif task_prompt == "Detailed Caption":
47 |         task_prompt_model = "<DETAILED_CAPTION>"
48 |     elif task_prompt == "More Detailed Caption":
49 |         task_prompt_model = "<MORE_DETAILED_CAPTION>"
50 |     elif task_prompt == "Caption + Grounding":
51 |         task_prompt_model = "<CAPTION_TO_PHRASE_GROUNDING>"
52 |     elif task_prompt == "Detailed Caption + Grounding":
53 |         task_prompt_model = "<CAPTION_TO_PHRASE_GROUNDING>"
54 |     elif task_prompt == "More Detailed Caption + Grounding":
55 |         task_prompt_model = "<CAPTION_TO_PHRASE_GROUNDING>"
56 |     elif task_prompt == "Object Detection":
57 |         task_prompt_model = "<OD>"
58 |     elif task_prompt == "Dense Region Caption":
59 |         task_prompt_model = "<DENSE_REGION_CAPTION>"
60 |     elif task_prompt == "Region Proposal":
61 |         task_prompt_model = "<REGION_PROPOSAL>"
62 |     elif task_prompt == "Caption to Phrase Grounding":
63 |         task_prompt_model = "<CAPTION_TO_PHRASE_GROUNDING>"
64 |     elif task_prompt == "Referring Expression Segmentation":
65 |         task_prompt_model = "<REFERRING_EXPRESSION_SEGMENTATION>"
66 |     elif task_prompt == "Region to Segmentation":
67 |         task_prompt_model = "<REGION_TO_SEGMENTATION>"
68 |     elif task_prompt == "Open Vocabulary Detection":
69 |         task_prompt_model = "<OPEN_VOCABULARY_DETECTION>"
70 |     elif task_prompt == "Region to Category":
71 |         task_prompt_model = "<REGION_TO_CATEGORY>"
72 |     elif task_prompt == "Region to Description":
73 |         task_prompt_model = "<REGION_TO_DESCRIPTION>"
74 |     elif task_prompt == "OCR":
75 |         task_prompt_model = "<OCR>"
76 |     elif task_prompt == "OCR with Region":
77 |         task_prompt_model = "<OCR_WITH_REGION>"
78 |     else:
79 |         raise ValueError("Invalid task prompt")
80 | 
81 |     results, processed_image = pre_process_image(
82 |         pil_image,
83 |         task_prompt_model,
84 |         model_state.vision_model,
85 |         model_state.vision_processor,
86 |     )
87 |     # Update responseDocument fields based on the results
88 |     process_image_result = responseDocument(text=str(results))
89 | 
90 |     if processed_image is not None:
91 |         process_image_result.add_image(f"{task_prompt}", processed_image)
92 | 
93 |     return process_image_result
94 | 
95 | 
96 | # Dispatch on the Florence-2 task token and post-process the model output
97 | def pre_process_image(image, task_prompt, vision_model, vision_processor):
98 |     if task_prompt == "<CAPTION>":
99 |         results = run_example(task_prompt, image, vision_model, vision_processor)
100 |         return results, None
101 |     elif task_prompt == "<DETAILED_CAPTION>":
102 |         results = run_example(task_prompt, image, vision_model, vision_processor)
103 |         return results, None
104 |     elif task_prompt == "<MORE_DETAILED_CAPTION>":
105 |         results = run_example(task_prompt, image, vision_model, vision_processor)
106 |         return results, None
107 |     elif task_prompt == "<CAPTION_TO_PHRASE_GROUNDING>":
108 |         results = run_example(task_prompt, image, vision_model, vision_processor)
109 |         fig = plot_bbox(image, results[task_prompt])
110 |         return results, fig_to_pil(fig)
111 |     elif task_prompt == "<OD>":
112 |         results = run_example(task_prompt, image, vision_model, vision_processor)
113 |         fig = plot_bbox(image, results[task_prompt])
114 |         return results, fig_to_pil(fig)
115 |     elif task_prompt == "<DENSE_REGION_CAPTION>":
116 |         results = run_example(task_prompt, image, vision_model, vision_processor)
117 |         fig = plot_bbox(image, results[task_prompt])
118 |         return results, fig_to_pil(fig)
119 |     elif task_prompt == "<REGION_PROPOSAL>":
120 |         results = run_example(task_prompt, image, vision_model, vision_processor)
121 |         fig = plot_bbox(image, results[task_prompt])
122 |         return results, fig_to_pil(fig)
123 |     elif task_prompt == "<OPEN_VOCABULARY_DETECTION>":
124 |         results = run_example(task_prompt, image, vision_model, vision_processor)
125 |         fig = plot_bbox(image, results[task_prompt])
126 |         return results, fig_to_pil(fig)
127 |     elif task_prompt == "<REFERRING_EXPRESSION_SEGMENTATION>":
128 |         results = run_example(task_prompt, image, vision_model, vision_processor)
129 |         output_image = copy.deepcopy(image)
130 |         output_image = draw_polygons(output_image, results[task_prompt], fill_mask=True)
131 |         return results, output_image
132 |     elif task_prompt == "<REGION_TO_SEGMENTATION>":
133 |         results = run_example(task_prompt, image, vision_model, vision_processor)
134 |         output_image = copy.deepcopy(image)
135 |         output_image = draw_polygons(output_image, results[task_prompt], fill_mask=True)
136 |         return results, output_image
137 |     elif task_prompt == "<REGION_TO_CATEGORY>":
138 |         results = run_example(task_prompt, image, vision_model, vision_processor)
139 |         return results, None
140 |     elif task_prompt == "<REGION_TO_DESCRIPTION>":
141 |         results = run_example(task_prompt, image, vision_model, vision_processor)
142 |         return results, None
143 |     elif task_prompt == "<OCR>":
144 |         results = run_example(task_prompt, image, vision_model, vision_processor)
145 |         return results, None
146 |     elif task_prompt == "<OCR_WITH_REGION>":
147 |         results = run_example(task_prompt, image, vision_model, vision_processor)
148 |         output_image = copy.deepcopy(image)
149 |         output_image = draw_ocr_bboxes(output_image, results[task_prompt])
150 |         return results, output_image
151 |     else:
152 |         raise ValueError("Invalid task prompt")
153 | 
154 | 
155 | def run_example(task_prompt, image, vision_model, vision_processor):
156 |     # if text_input is None:
157 |     prompt = task_prompt
158 |     # else:
159 |     #     prompt = task_prompt + text_input
160 |     inputs = vision_processor(text=prompt, images=image, return_tensors="pt").to("cuda")
161 |     generated_ids = vision_model.generate(
162 |         input_ids=inputs["input_ids"],
163 |         pixel_values=inputs["pixel_values"],
164 |         max_new_tokens=1024,
165 |         early_stopping=False,
166 |         do_sample=False,
167 |         num_beams=3,
168 |     )
169 |     generated_text = vision_processor.batch_decode(
170 |         generated_ids, skip_special_tokens=False
171 |     )[0]
172 |     parsed_answer = vision_processor.post_process_generation(
173 |         generated_text, task=task_prompt, image_size=(image.width, image.height)
174 |     )
175 |     return parsed_answer
176 | 
--------------------------------------------------------------------------------
/omniparse/image/router.py:
--------------------------------------------------------------------------------
1 | from fastapi import UploadFile, File, HTTPException, APIRouter, Form
2 | from fastapi.responses import JSONResponse
3 | from omniparse import get_shared_state
4 | from omniparse.image import parse_image, process_image
5 | from omniparse.models import responseDocument
6 | 
7 | image_router = APIRouter()
8 | model_state = get_shared_state()
9 | 
10 | 
11 | @image_router.post("/image")
12 | async def parse_image_endpoint(file: UploadFile = File(...)):
13 |     try:
14 |         file_bytes = await file.read()
15 |         result: responseDocument = parse_image(file_bytes, model_state)
16 |         return JSONResponse(content=result.model_dump())
17 | 
18 |     except Exception as e:
19 |         raise HTTPException(status_code=500, detail=str(e))
20 | 
21 | 
22 | @image_router.post("/process_image")
23 | async def process_image_route(image:
UploadFile = File(...), task: str = Form(...)):
24 |     try:
25 |         file_bytes = await image.read()
26 |         result: responseDocument = process_image(file_bytes, task, model_state)
27 |         return JSONResponse(content=result.model_dump())
28 | 
29 |     except Exception as e:
30 |         raise HTTPException(status_code=500, detail=str(e))
31 | 
--------------------------------------------------------------------------------
/omniparse/image/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Title: OmniParse
3 | Author: Adithya S Kolavi
4 | Date: 2024-07-02
5 | 
6 | This code includes portions of code from the Florence-2 repository by gokaygokay.
7 | Original repository: https://huggingface.co/spaces/gokaygokay/Florence-2
8 | 
9 | Original Author: gokaygokay
10 | Original Date: 2024-06-30
11 | 
12 | URL: https://huggingface.co/spaces/gokaygokay/Florence-2
13 | """
14 | 
15 | import io
16 | import random
17 | import numpy as np
18 | from PIL import Image, ImageDraw, ImageFont
19 | import matplotlib.pyplot as plt
20 | import matplotlib.patches as patches
21 | 
22 | 
23 | def plot_bbox(image, data):
24 |     fig, ax = plt.subplots()
25 |     ax.imshow(image)
26 |     for bbox, label in zip(data["bboxes"], data["labels"]):
27 |         x1, y1, x2, y2 = bbox
28 |         rect = patches.Rectangle(
29 |             (x1, y1), x2 - x1, y2 - y1, linewidth=1, edgecolor="r", facecolor="none"
30 |         )
31 |         ax.add_patch(rect)
32 |         plt.text(
33 |             x1,
34 |             y1,
35 |             label,
36 |             color="white",
37 |             fontsize=8,
38 |             bbox=dict(facecolor="red", alpha=0.5),
39 |         )
40 |     ax.axis("off")
41 |     return fig
42 | 
43 | 
44 | colormap = [
45 |     "blue",
46 |     "orange",
47 |     "green",
48 |     "purple",
49 |     "brown",
50 |     "pink",
51 |     "gray",
52 |     "olive",
53 |     "cyan",
54 |     "red",
55 |     "lime",
56 |     "indigo",
57 |     "violet",
58 |     "aqua",
59 |     "magenta",
60 |     "coral",
61 |     "gold",
62 |     "tan",
63 |     "skyblue",
64 | ]
65 | 
66 | 
67 | def draw_polygons(image, prediction, fill_mask=False):
68 |     draw = ImageDraw.Draw(image)
69 |     scale = 1
70 |     for polygons, label in zip(prediction["polygons"], prediction["labels"]):
71 |         color = random.choice(colormap)
72 |         fill_color = random.choice(colormap) if fill_mask else None
73 |         for _polygon in polygons:
74 |             _polygon = np.array(_polygon).reshape(-1, 2)
75 |             if len(_polygon) < 3:
76 |                 print("Invalid polygon:", _polygon)
77 |                 continue
78 |             _polygon = (_polygon * scale).reshape(-1).tolist()
79 |             if fill_mask:
80 |                 draw.polygon(_polygon, outline=color, fill=fill_color)
81 |             else:
82 |                 draw.polygon(_polygon, outline=color)
83 |             draw.text((_polygon[0] + 8, _polygon[1] + 2), label, fill=color)
84 |     return image
85 | 
86 | 
87 | def convert_to_od_format(data):
88 |     bboxes = data.get("bboxes", [])
89 |     labels = data.get("bboxes_labels", [])
90 |     od_results = {"bboxes": bboxes, "labels": labels}
91 |     return od_results
92 | 
93 | 
94 | def draw_ocr_bboxes(image, prediction):
95 |     scale = 1
96 |     draw = ImageDraw.Draw(image)
97 |     bboxes, labels = prediction["quad_boxes"], prediction["labels"]
98 |     for box, label in zip(bboxes, labels):
99 |         color = random.choice(colormap)
100 |         new_box = (np.array(box) * scale).tolist()
101 |         draw.polygon(new_box, width=3, outline=color)
102 |         draw.text(
103 |             (new_box[0] + 8, new_box[1] + 2),
104 |             "{}".format(label),
105 |             align="right",
106 |             fill=color,
107 |         )
108 |     return image
109 | 
110 | 
111 | def fig_to_pil(fig):
112 |     buf = io.BytesIO()
113 |     fig.savefig(buf, format="png")
114 |     buf.seek(0)
115 |     return Image.open(buf)
116 | 
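
The image routes above accept multipart form uploads. A hedged sketch of calling them with `curl`, assuming `image_router` is mounted under the `/parse_image` prefix as in the project README:

```bash
# Parse an image into markdown text (routes it through the marker PDF pipeline)
curl -X POST -F "file=@/path/to/scan.png" http://localhost:8000/parse_image/image

# Run a specific Florence-2 task; task names match the mapping in process.py
curl -X POST http://localhost:8000/parse_image/process_image \
  -F "image=@/path/to/photo.jpg" \
  -F "task=Detailed Caption"
```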
--------------------------------------------------------------------------------
/omniparse/media/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Title: OmniParse
3 | Author: Adithya S K
4 | Date: 2024-07-02
5 | 
6 | This code includes portions of code from the CLIP repository by OpenAI.
7 | Original repository: https://github.com/openai/CLIP
8 | 
9 | Original Author: OpenAI
10 | Original Date: 2021-01-05
11 | 
12 | License: MIT License
13 | URL: https://github.com/openai/CLIP/blob/main/LICENSE
14 | 
15 | Description:
16 | This section of the code was adapted from the CLIP repository to integrate audio processing capabilities into the OmniParse platform.
17 | All credits for the original implementation go to OpenAI.
18 | """
19 | 
20 | import os
21 | import tempfile
22 | from fastapi import UploadFile
23 | from fastapi.responses import JSONResponse
24 | from moviepy.editor import VideoFileClip
25 | from omniparse.models import responseDocument
26 | from omniparse.media.utils import WHISPER_DEFAULT_SETTINGS
27 | from omniparse.media.utils import transcribe  # transcription helper defined in omniparse.media.utils
28 | 
29 | 
30 | def parse_audio(input_data, model_state) -> responseDocument:
31 |     try:
32 |         if isinstance(input_data, bytes):
33 |             with tempfile.NamedTemporaryFile(
34 |                 delete=False, suffix=".wav"
35 |             ) as temp_audio_file:
36 |                 temp_audio_file.write(input_data)
37 |                 temp_audio_path = temp_audio_file.name
38 |         elif isinstance(input_data, str) and os.path.isfile(input_data):
39 |             temp_audio_path = input_data
40 |         else:
41 |             raise ValueError(
42 |                 "Invalid input data format. Expected audio bytes or audio file path."
43 |             )
44 | 
45 |         # Transcribe the audio file
46 |         transcript = transcribe(
47 |             audio_path=temp_audio_path,
48 |             whisper_model=model_state.whisper_model,
49 |             **WHISPER_DEFAULT_SETTINGS,
50 |         )
51 | 
52 |         return responseDocument(text=transcript["text"])
53 | 
54 |     finally:
55 |         # Remove the temp file only if it was created here from raw bytes
56 |         if isinstance(input_data, bytes) and os.path.exists(temp_audio_path):
57 |             os.remove(temp_audio_path)
58 | 
59 | 
60 | def parse_video(input_data, model_state) -> responseDocument:
61 |     video_path = None
62 |     audio_path = None
63 |     try:
64 |         if isinstance(input_data, bytes):
65 |             with tempfile.NamedTemporaryFile(
66 |                 delete=False, suffix=".mp4"
67 |             ) as temp_video_file:
68 |                 temp_video_file.write(input_data)
69 |                 video_path = temp_video_file.name
70 |         elif isinstance(input_data, str) and os.path.isfile(input_data):
71 |             video_path = input_data
72 |         else:
73 |             raise ValueError(
74 |                 "Invalid input data format. Expected video bytes or video file path."
75 |             )
76 | 
77 |         # Extract audio from the video
78 |         audio_path = f"{tempfile.gettempdir()}/{os.path.splitext(os.path.basename(video_path))[0]}.mp3"
79 |         video_clip = VideoFileClip(video_path)
80 |         audio_clip = video_clip.audio
81 |         audio_clip.write_audiofile(audio_path)
82 |         audio_clip.close()
83 |         video_clip.close()
84 | 
85 |         # Transcribe the extracted audio
86 |         transcript = transcribe(
87 |             audio_path=audio_path,
88 |             whisper_model=model_state.whisper_model,
89 |             **WHISPER_DEFAULT_SETTINGS,
90 |         )
91 | 
92 |         return responseDocument(text=transcript["text"])
93 | 
94 |     finally:
95 |         # Clean up temporary files; never delete a caller-supplied video path
96 |         if isinstance(input_data, bytes) and video_path and os.path.exists(video_path):
97 |             os.remove(video_path)
98 |         if audio_path and os.path.exists(audio_path):
99 |             os.remove(audio_path)
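
A minimal sketch of driving these parsers directly from Python, which is essentially what the FastAPI routes below do (assumes the media models are loaded and a local `meeting.wav` exists):

```python
from omniparse import load_omnimodel, get_shared_state
from omniparse.media import parse_audio

# Load only the Whisper model (equivalent to starting server.py with --media)
load_omnimodel(load_documents=False, load_media=True, load_web=False)

with open("meeting.wav", "rb") as f:
    result = parse_audio(f.read(), get_shared_state())

print(result.text)
```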
--------------------------------------------------------------------------------
/omniparse/media/router.py:
--------------------------------------------------------------------------------
1 | """
2 | Title: OmniParse
3 | Author: Adithya S K
4 | Date: 2024-07-02
5 | 
6 | This code includes portions of code from the CLIP repository by OpenAI.
7 | Original repository: https://github.com/openai/CLIP
8 | 
9 | Original Author: OpenAI
10 | Original Date: 2021-01-05
11 | 
12 | License: MIT License
13 | URL: https://github.com/openai/CLIP/blob/main/LICENSE
14 | 
15 | Description:
16 | This section of the code was adapted from the CLIP repository to integrate audio processing capabilities into the OmniParse platform.
17 | All credits for the original implementation go to OpenAI.
18 | """
19 | 
20 | from fastapi import FastAPI, UploadFile, File, HTTPException, APIRouter, status, Form
21 | from fastapi.responses import JSONResponse
22 | from omniparse.models import responseDocument
23 | from omniparse.media import parse_audio, parse_video
24 | from omniparse import get_shared_state
25 | 
26 | media_router = APIRouter()
27 | model_state = get_shared_state()
28 | 
29 | 
30 | @media_router.post("/audio")
31 | async def parse_audio_endpoint(file: UploadFile = File(...)):
32 |     try:
33 |         file_bytes = await file.read()
34 |         result: responseDocument = parse_audio(file_bytes, model_state)
35 |         return JSONResponse(content=result.model_dump())
36 | 
37 |     except Exception as e:
38 |         raise HTTPException(status_code=500, detail=str(e))
39 | 
40 | 
41 | @media_router.post("/video")
42 | async def parse_video_endpoint(file: UploadFile = File(...)):
43 |     try:
44 |         file_bytes = await file.read()
45 |         result: responseDocument = parse_video(file_bytes, model_state)
46 |         return JSONResponse(content=result.model_dump())
47 | 
48 |     except Exception as e:
49 |         raise HTTPException(status_code=500, detail=str(e))
50 | 
--------------------------------------------------------------------------------
/omniparse/media/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Title: OmniParse
3 | Author: Adithya S K
4 | Date: 2024-07-02
5 | 
6 | This code includes portions of code from the CLIP repository by OpenAI.
7 | Original repository: https://github.com/openai/CLIP
8 | 
9 | Original Author: OpenAI
10 | Original Date: 2021-01-05
11 | 
12 | License: MIT License
13 | URL: https://github.com/openai/CLIP/blob/main/LICENSE
14 | 
15 | Description:
16 | This section of the code was adapted from the CLIP repository to integrate audio processing capabilities into the OmniParse platform.
17 | All credits for the original implementation go to OpenAI.
18 | """
19 | 
20 | import numpy as np
21 | 
22 | 
23 | def transcribe(audio_path: str, whisper_model, **whisper_args):
24 |     """Transcribe the audio file using whisper"""
25 | 
26 |     # Get whisper model
27 |     # NOTE: If multiple models are selected, this may keep all of them in memory depending on the cache size
28 | 
29 |     # Set configs & transcribe
30 |     if whisper_args["temperature_increment_on_fallback"] is not None:
31 |         whisper_args["temperature"] = tuple(
32 |             np.arange(
33 |                 whisper_args["temperature"],
34 |                 1.0 + 1e-6,
35 |                 whisper_args["temperature_increment_on_fallback"],
36 |             )
37 |         )
38 |     else:
39 |         whisper_args["temperature"] = [whisper_args["temperature"]]
40 | 
41 |     del whisper_args["temperature_increment_on_fallback"]
42 | 
43 |     transcript = whisper_model.transcribe(
44 |         audio_path,
45 |         **whisper_args,
46 |     )
47 | 
48 |     return transcript
49 | 
50 | 
51 | # Default decoding settings passed through to whisper_model.transcribe
52 | WHISPER_DEFAULT_SETTINGS = {
53 |     "temperature": 0.0,
54 |     "temperature_increment_on_fallback": 0.2,
55 |     "no_speech_threshold": 0.6,
56 |     "logprob_threshold": -1.0,
57 |     "compression_ratio_threshold": 2.4,
58 |     "condition_on_previous_text": True,
59 |     "verbose": False,
60 |     "task": "transcribe",
61 | }
62 | 
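
`transcribe` can also be used on its own. A short sketch (assumes `openai-whisper` is installed and a local `sample.wav` exists); note how the fallback-temperature handling above expands the single `temperature` value into the tuple Whisper expects:

```python
import whisper

from omniparse.media.utils import transcribe, WHISPER_DEFAULT_SETTINGS

model = whisper.load_model("small")  # the same checkpoint load_omnimodel uses
result = transcribe("sample.wav", model, **WHISPER_DEFAULT_SETTINGS)
print(result["text"])
```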
--------------------------------------------------------------------------------
/omniparse/models/__init__.py:
--------------------------------------------------------------------------------
1 | import base64
2 | from io import BytesIO
3 | from PIL import Image as PILImage
4 | from typing import Callable, List, Dict, Any, Union
5 | from fastapi import HTTPException
6 | from pydantic import BaseModel, Field
7 | 
8 | 
9 | class responseImage(BaseModel):
10 |     image: str = ""
11 |     image_name: str = ""
12 |     image_info: Union[Dict[str, Any], None] = Field(default_factory=dict)
13 | 
14 | 
15 | class responseDocument(BaseModel):
16 |     text: str = ""
17 |     images: List[responseImage] = Field(default_factory=list)
18 |     metadata: Dict[str, Any] = Field(default_factory=dict)
19 |     chunks: List[str] = Field(default_factory=list)
20 | 
21 |     def add_image(
22 |         self,
23 |         image_name: str,
24 |         image_data: Union[str, PILImage.Image],
25 |         image_info: Union[Dict[str, Any], None] = {},
26 |     ):
27 |         if isinstance(image_data, str):
28 |             # If image_data is base64 encoded, decode it
29 |             try:
30 |                 image_bytes = base64.b64decode(image_data)
31 |                 pil_image = PILImage.open(BytesIO(image_bytes))
32 |             except Exception as e:
33 |                 raise HTTPException(
34 |                     status_code=500, detail=f"Failed to decode base64 image: {str(e)}"
35 |                 )
36 |         elif isinstance(image_data, PILImage.Image):
37 |             # If image_data is already a PIL.Image instance, use it directly
38 |             pil_image = image_data
39 |         else:
40 |             raise ValueError(
41 |                 "Unsupported image_data type. Should be either a base64-encoded string or a PIL.Image instance."
42 |             )
43 | 
44 |         new_image = responseImage(
45 |             image=self.encode_image_to_base64(pil_image),
46 |             image_name=image_name,
47 |             image_info=image_info,
48 |         )
49 |         self.images.append(new_image)
50 | 
51 |     def encode_image_to_base64(self, image: PILImage.Image) -> str:
52 |         # Convert PIL image to base64 string
53 |         buffered = BytesIO()
54 |         image.save(buffered, format="JPEG", quality=85)
55 |         img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
56 |         return img_base64
57 | 
58 |     def image_processor(self, image_processor: Callable[[str], str]):
59 |         for img in self.images:
60 |             if not img.image_info.get("caption"):  # Only generate caption if it's empty
61 |                 img.image_info["caption"] = image_processor(img.image_name)
62 | 
63 |     def chunk_text(self, chunker: Callable[[str], List[str]]):
64 |         self.chunks = chunker(self.text)
65 | 
--------------------------------------------------------------------------------
/omniparse/sheets/__init__.py:
--------------------------------------------------------------------------------
1 | ## For excel, csv and other table/sheet based files
--------------------------------------------------------------------------------
/omniparse/utils.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import os
3 | from art import text2art
4 | from omniparse.models import responseDocument
5 | 
6 | 
7 | def encode_images(images, inputDocument: responseDocument):
8 |     for i, (filename, image) in enumerate(images.items()):
9 |         # print(f"Processing image {filename}")
10 |         # Save image as PNG
11 |         image.save(filename, "PNG")
12 |         # Read the saved image file as bytes
13 |         with open(filename, "rb") as f:
14 |             image_bytes = f.read()
15 |         # Convert image to base64
16 |         image_base64 = base64.b64encode(image_bytes).decode("utf-8")
17 | 
18 |         inputDocument.add_image(image_name=filename, image_data=image_base64)
19 | 
20 |         # Remove the temporary image file
21 |         os.remove(filename)
22 | 
23 | 
24 | def print_omniparse_text_art(suffix=None):
25 |     font = "nancyj"
26 |     ascii_text = " OmniParse"
27 |     if suffix:
28 |         ascii_text += f" x {suffix}"
29 |     ascii_art = text2art(ascii_text, font=font)
30 |     print("\n")
31 |     print(ascii_art)
32 |     print("""Created by Adithya S K : https://twitter.com/adithya_s_k""")
33 |     print("\n")
34 |     print("\n")
35 | 
--------------------------------------------------------------------------------
/omniparse/web/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Title: OmniParse
3 | Author: Adithya S K
4 | Date: 2024-07-02
5 | 
6 | This code includes portions of code from the crawl4ai repository by unclecode, licensed under the Apache 2.0 License.
7 | Original repository: https://github.com/unclecode/crawl4ai 8 | 9 | Original Author: unclecode 10 | 11 | License: Apache 2.0 License 12 | URL: https://github.com/unclecode/crawl4ai/blob/main/LICENSE 13 | """ 14 | 15 | import asyncio 16 | import logging 17 | from concurrent.futures import ThreadPoolExecutor 18 | from omniparse.models import responseDocument 19 | 20 | 21 | async def parse_url(url: str, model_state) -> responseDocument: 22 | try: 23 | logging.debug("[LOG] Loading extraction and chunking strategies...") 24 | # Hardcoded parameters (adjust as needed) 25 | include_raw_html = False 26 | bypass_cache = True 27 | word_count_threshold = 5 28 | css_selector = None 29 | screenshot = True 30 | user_agent = None 31 | verbose = True 32 | 33 | # Use ThreadPoolExecutor to run the synchronous WebCrawler in async manner 34 | logging.debug("[LOG] Running the WebCrawler...") 35 | with ThreadPoolExecutor() as executor: 36 | loop = asyncio.get_event_loop() 37 | future = loop.run_in_executor( 38 | executor, 39 | model_state.crawler.run, 40 | str(url), 41 | word_count_threshold, 42 | bypass_cache, 43 | css_selector, 44 | screenshot, 45 | user_agent, 46 | verbose, 47 | ) 48 | result = await future 49 | 50 | return result 51 | 52 | except Exception as e: 53 | logging.error(f"[ERROR] Error parsing webpage: {str(e)}") 54 | return {"message": "Error in parsing webpage", "error": str(e)} 55 | -------------------------------------------------------------------------------- /omniparse/web/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Title: OmniParse 3 | Author: Adithya S K 4 | Date: 2024-07-02 5 | 6 | This code includes portions of code from the crawl4ai repository by unclecode, licensed under the Apache 2.0 License. 
7 | Original repository: https://github.com/unclecode/crawl4ai 8 | 9 | Original Author: unclecode 10 | 11 | License: Apache 2.0 License 12 | URL: https://github.com/unclecode/crawl4ai/blob/main/LICENSE 13 | """ 14 | 15 | import os 16 | from dotenv import load_dotenv 17 | 18 | load_dotenv()  # Load environment variables from .env file 19 | 20 | # Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy 21 | DEFAULT_PROVIDER = "openai/gpt-4-turbo" 22 | MODEL_REPO_BRANCH = "new-release-0.0.2" 23 | # Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy 24 | PROVIDER_MODELS = { 25 | "ollama/llama3": "no-token-needed",  # Ollama models run locally, so no API token is needed 26 | "groq/llama3-70b-8192": os.getenv("GROQ_API_KEY"), 27 | "groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"), 28 | "openai/gpt-3.5-turbo": os.getenv("OPENAI_API_KEY"), 29 | "openai/gpt-4-turbo": os.getenv("OPENAI_API_KEY"), 30 | "openai/gpt-4o": os.getenv("OPENAI_API_KEY"), 31 | "anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"), 32 | "anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"), 33 | "anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"), 34 | } 35 | 36 | 37 | # Chunk token threshold 38 | CHUNK_TOKEN_THRESHOLD = 1000 39 | 40 | # Minimum number of words an HTML tag must contain to be considered 41 | MIN_WORD_THRESHOLD = 5 42 | -------------------------------------------------------------------------------- /omniparse/web/crawler_strategy.py: -------------------------------------------------------------------------------- 1 | """ 2 | Title: OmniParse 3 | Author: Adithya S K 4 | Date: 2024-07-02 5 | 6 | This code includes portions of code from the crawl4ai repository by unclecode, licensed under the Apache 2.0 License.
7 | Original repository: https://github.com/unclecode/crawl4ai 8 | 9 | Original Author: unclecode 10 | 11 | License: Apache 2.0 License 12 | URL: https://github.com/unclecode/crawl4ai/blob/main/LICENSE 13 | """ 14 | 15 | from abc import ABC, abstractmethod 16 | from selenium import webdriver 17 | from selenium.webdriver.chrome.service import Service 18 | from selenium.webdriver.common.by import By 19 | from selenium.webdriver.support.ui import WebDriverWait 20 | from selenium.webdriver.support import expected_conditions as EC 21 | from selenium.webdriver.chrome.options import Options 22 | from selenium.common.exceptions import InvalidArgumentException 23 | from webdriver_manager.chrome import ChromeDriverManager 24 | import logging 25 | import base64 26 | from PIL import Image, ImageDraw, ImageFont 27 | from io import BytesIO 28 | from typing import List 29 | from pathlib import Path 30 | from omniparse.web.utils import wrap_text 31 | 32 | logger = logging.getLogger("selenium.webdriver.remote.remote_connection") 33 | logger.setLevel(logging.WARNING) 34 | 35 | logger_driver = logging.getLogger("selenium.webdriver.common.service") 36 | logger_driver.setLevel(logging.WARNING) 37 | 38 | urllib3_logger = logging.getLogger("urllib3.connectionpool") 39 | urllib3_logger.setLevel(logging.WARNING) 40 | 41 | # Disable http.client logging 42 | http_client_logger = logging.getLogger("http.client") 43 | http_client_logger.setLevel(logging.WARNING) 44 | 45 | # Disable driver_finder and service logging 46 | driver_finder_logger = logging.getLogger("selenium.webdriver.common.driver_finder") 47 | driver_finder_logger.setLevel(logging.WARNING) 48 | 49 | 50 | class CrawlerStrategy(ABC): 51 | @abstractmethod 52 | def crawl(self, url: str, **kwargs) -> str: 53 | pass 54 | 55 | @abstractmethod 56 | def take_screenshot(self, save_path: str): 57 | pass 58 | 59 | @abstractmethod 60 | def update_user_agent(self, user_agent: str): 61 | pass 62 | 63 | 64 | class LocalSeleniumCrawlerStrategy(CrawlerStrategy): 65 | def __init__(self, use_cached_html=False, js_code=None, **kwargs): 66 | super().__init__() 67 | self.options = Options() 68 | self.options.headless = True 69 | if kwargs.get("user_agent"): 70 | self.options.add_argument("--user-agent=" + kwargs.get("user_agent")) 71 | self.options.add_argument("--no-sandbox") 72 | self.options.add_argument("--headless") 73 | # self.options.add_argument("--disable-dev-shm-usage") 74 | self.options.add_argument("--disable-gpu") 75 | # self.options.add_argument("--disable-extensions") 76 | # self.options.add_argument("--disable-infobars") 77 | # self.options.add_argument("--disable-logging") 78 | # self.options.add_argument("--disable-popup-blocking") 79 | # self.options.add_argument("--disable-translate") 80 | # self.options.add_argument("--disable-default-apps") 81 | # self.options.add_argument("--disable-background-networking") 82 | # self.options.add_argument("--disable-sync") 83 | # self.options.add_argument("--disable-features=NetworkService,NetworkServiceInProcess") 84 | # self.options.add_argument("--disable-browser-side-navigation") 85 | # self.options.add_argument("--dns-prefetch-disable") 86 | # self.options.add_argument("--disable-web-security") 87 | self.options.add_argument("--log-level=3") 88 | self.use_cached_html = use_cached_html 89 | 90 | self.js_code = js_code 91 | self.verbose = kwargs.get("verbose", False) 92 | 93 | # chromedriver_autoinstaller.install() 94 | # import chromedriver_autoinstaller 95 | self.service =
Service(ChromeDriverManager().install()) 96 | self.service.log_path = "NUL" 97 | self.driver = webdriver.Chrome(service=self.service, options=self.options) 98 | 99 | def update_user_agent(self, user_agent: str): 100 | self.options.add_argument(f"user-agent={user_agent}") 101 | self.driver.quit() 102 | self.driver = webdriver.Chrome(service=self.service, options=self.options) 103 | 104 | def crawl(self, url: str) -> str: 105 | try: 106 | if self.verbose: 107 | print(f"[LOG] Crawling {url} using Web Crawler...") 108 | self.driver.get(url) 109 | WebDriverWait(self.driver, 10).until( 110 | EC.presence_of_all_elements_located((By.TAG_NAME, "html")) 111 | ) 112 | 113 | # Execute JS code if provided 114 | if self.js_code and type(self.js_code) == str: 115 | self.driver.execute_script(self.js_code) 116 | # Optionally, wait for some condition after executing the JS code 117 | WebDriverWait(self.driver, 10).until( 118 | lambda driver: driver.execute_script("return document.readyState") 119 | == "complete" 120 | ) 121 | elif self.js_code and type(self.js_code) == list: 122 | for js in self.js_code: 123 | self.driver.execute_script(js) 124 | WebDriverWait(self.driver, 10).until( 125 | lambda driver: driver.execute_script( 126 | "return document.readyState" 127 | ) 128 | == "complete" 129 | ) 130 | 131 | html = self.driver.page_source 132 | if self.verbose: 133 | print(f"[LOG] ✅ Crawled {url} successfully!") 134 | 135 | return html 136 | except InvalidArgumentException: 137 | raise InvalidArgumentException(f"Invalid URL {url}") 138 | except Exception as e: 139 | raise Exception(f"Failed to crawl {url}: {str(e)}") 140 | 141 | def take_screenshot(self) -> str: 142 | try: 143 | # Get the dimensions of the page 144 | total_width = self.driver.execute_script("return document.body.scrollWidth") 145 | total_height = self.driver.execute_script( 146 | "return document.body.scrollHeight" 147 | ) 148 | 149 | # Set the window size to the dimensions of the page 150 | self.driver.set_window_size(total_width, total_height) 151 | 152 | # Take screenshot 153 | screenshot = self.driver.get_screenshot_as_png() 154 | 155 | # Open the screenshot with PIL 156 | image = Image.open(BytesIO(screenshot)) 157 | 158 | # Convert to JPEG and compress 159 | buffered = BytesIO() 160 | image.save(buffered, format="JPEG", quality=85) 161 | img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") 162 | 163 | if self.verbose: 164 | print(f"[LOG] 📸 Screenshot taken and converted to base64") 165 | 166 | return img_base64 167 | 168 | except Exception as e: 169 | error_message = f"Failed to take screenshot: {str(e)}" 170 | print(error_message) 171 | 172 | # Generate an image with black background 173 | img = Image.new("RGB", (800, 600), color="black") 174 | draw = ImageDraw.Draw(img) 175 | 176 | # Load a font 177 | try: 178 | font = ImageFont.truetype("arial.ttf", 40) 179 | except IOError: 180 | font = ImageFont.load_default(size=40) 181 | 182 | # Define text color and wrap the text 183 | text_color = (255, 255, 255) 184 | max_width = 780 185 | wrapped_text = wrap_text(draw, error_message, font, max_width) 186 | 187 | # Calculate text position 188 | text_position = (10, 10) 189 | 190 | # Draw the text on the image 191 | draw.text(text_position, wrapped_text, fill=text_color, font=font) 192 | 193 | # Convert to base64 194 | buffered = BytesIO() 195 | img.save(buffered, format="JPEG") 196 | img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") 197 | 198 | return img_base64 199 | 200 | def quit(self): 201 | 
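"""Quit and clean up the underlying Chrome WebDriver session."""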
self.driver.quit() 202 | -------------------------------------------------------------------------------- /omniparse/web/model_loader.py: -------------------------------------------------------------------------------- 1 | """ 2 | Title: OmniParse 3 | Author: Adithya S K 4 | Date: 2024-07-02 5 | 6 | This code includes portions of code from the crawl4ai repository by unclecode, licensed under the Apache 2.0 License. 7 | Original repository: https://github.com/unclecode/crawl4ai 8 | 9 | Original Author: unclecode 10 | 11 | License: Apache 2.0 License 12 | URL: https://github.com/unclecode/crawl4ai/blob/main/LICENSE 13 | """ 14 | 15 | import os 16 | from functools import lru_cache 17 | from pathlib import Path 18 | import subprocess 19 | import shutil 20 | import tarfile 21 | from .config import MODEL_REPO_BRANCH 22 | import argparse 23 | import urllib.request 24 | 25 | __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) 26 | 27 | 28 | @lru_cache() 29 | def get_available_memory(device): 30 | import torch 31 | 32 | if device.type == "cuda": 33 | return torch.cuda.get_device_properties(device).total_memory 34 | elif device.type == "mps": 35 | return 48 * 1024**3  # Assume 48GB of unified memory for MPS devices 36 | else: 37 | return 0 38 | 39 | 40 | @lru_cache() 41 | def calculate_batch_size(device): 42 | available_memory = get_available_memory(device) 43 | 44 | if device.type == "cpu": 45 | return 16 46 | elif device.type in ["cuda", "mps"]: 47 | # Adjust these thresholds based on your model size and available memory 48 | if available_memory >= 31 * 1024**3:  # ~32GB or more 49 | return 256 50 | elif available_memory >= 15 * 1024**3:  # ~16GB to 32GB 51 | return 128 52 | elif available_memory >= 8 * 1024**3:  # ~8GB to 16GB 53 | return 64 54 | else: 55 | return 32 56 | else: 57 | return 16  # Default batch size 58 | 59 | 60 | @lru_cache() 61 | def get_device(): 62 | import torch 63 | 64 | if torch.cuda.is_available(): 65 | device = torch.device("cuda") 66 | elif torch.backends.mps.is_available(): 67 | device = torch.device("mps") 68 | else: 69 | device = torch.device("cpu") 70 | return device 71 | 72 | 73 | def set_model_device(model): 74 | device = get_device() 75 | model.to(device) 76 | return model, device 77 | 78 | 79 | @lru_cache() 80 | def get_home_folder(): 81 | home_folder = os.path.join(Path.home(), ".omniparse") 82 | os.makedirs(home_folder, exist_ok=True) 83 | os.makedirs(f"{home_folder}/cache", exist_ok=True) 84 | os.makedirs(f"{home_folder}/models", exist_ok=True) 85 | return home_folder 86 | 87 | 88 | @lru_cache() 89 | def load_bert_base_uncased(): 90 | from transformers import BertTokenizer, BertModel 91 | 92 | tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", resume_download=None) 93 | model = BertModel.from_pretrained("bert-base-uncased", resume_download=None) 94 | model.eval() 95 | model, device = set_model_device(model) 96 | return tokenizer, model 97 | 98 | 99 | @lru_cache() 100 | def load_bge_small_en_v1_5(): 101 | from transformers import AutoTokenizer, AutoModel 102 | 103 | tokenizer = AutoTokenizer.from_pretrained( 104 | "BAAI/bge-small-en-v1.5", resume_download=None 105 | ) 106 | model = AutoModel.from_pretrained("BAAI/bge-small-en-v1.5", resume_download=None) 107 | model.eval() 108 | model, device = set_model_device(model) 109 | return tokenizer, model 110 | 111 | 112 | @lru_cache() 113 | def load_onnx_all_MiniLM_l6_v2(): 114 | from omniparse.web.onnx_embedding import
DefaultEmbeddingModel 115 | 116 | model_path = "models/onnx.tar.gz" 117 | model_url = "https://unclecode-files.s3.us-west-2.amazonaws.com/onnx.tar.gz" 118 | __location__ = os.path.realpath( 119 | os.path.join(os.getcwd(), os.path.dirname(__file__)) 120 | ) 121 | download_path = os.path.join(__location__, model_path) 122 | onnx_dir = os.path.join(__location__, "models/onnx") 123 | 124 | # Create the models directory if it does not exist 125 | os.makedirs(os.path.dirname(download_path), exist_ok=True) 126 | 127 | # Download the tar.gz file if it does not exist 128 | if not os.path.exists(download_path): 129 | 130 | def download_with_progress(url, filename): 131 | def reporthook(block_num, block_size, total_size): 132 | downloaded = block_num * block_size 133 | percentage = 100 * downloaded / total_size 134 | if downloaded < total_size: 135 | print( 136 | f"\rDownloading: {percentage:.2f}% ({downloaded / (1024 * 1024):.2f} MB of {total_size / (1024 * 1024):.2f} MB)", 137 | end="", 138 | ) 139 | else: 140 | print("\rDownload complete!") 141 | 142 | urllib.request.urlretrieve(url, filename, reporthook) 143 | 144 | download_with_progress(model_url, download_path) 145 | 146 | # Extract the tar.gz file if the onnx directory does not exist 147 | if not os.path.exists(onnx_dir): 148 | with tarfile.open(download_path, "r:gz") as tar: 149 | tar.extractall(path=os.path.join(__location__, "models")) 150 | 151 | # remove the tar.gz file 152 | os.remove(download_path) 153 | 154 | model = DefaultEmbeddingModel() 155 | return model 156 | 157 | 158 | @lru_cache() 159 | def load_text_classifier(): 160 | from transformers import AutoTokenizer, AutoModelForSequenceClassification 161 | from transformers import pipeline 162 | import torch 163 | 164 | tokenizer = AutoTokenizer.from_pretrained( 165 | "dstefa/roberta-base_topic_classification_nyt_news" 166 | ) 167 | model = AutoModelForSequenceClassification.from_pretrained( 168 | "dstefa/roberta-base_topic_classification_nyt_news" 169 | ) 170 | model.eval() 171 | model, device = set_model_device(model) 172 | pipe = pipeline("text-classification", model=model, tokenizer=tokenizer) 173 | return pipe 174 | 175 | 176 | @lru_cache() 177 | def load_text_multilabel_classifier(): 178 | from transformers import AutoModelForSequenceClassification, AutoTokenizer 179 | import numpy as np 180 | from scipy.special import expit 181 | import torch 182 | 183 | # Check for available device: CUDA, MPS (for Apple Silicon), or CPU 184 | if torch.cuda.is_available(): 185 | device = torch.device("cuda") 186 | elif torch.backends.mps.is_available(): 187 | device = torch.device("mps") 188 | else: 189 | device = torch.device("cpu") 190 | 191 | MODEL = "cardiffnlp/tweet-topic-21-multi" 192 | tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None) 193 | model = AutoModelForSequenceClassification.from_pretrained( 194 | MODEL, resume_download=None 195 | ) 196 | model.eval() 197 | model, device = set_model_device(model) 198 | class_mapping = model.config.id2label 199 | 200 | def _classifier(texts, threshold=0.5, max_length=64): 201 | tokens = tokenizer( 202 | texts, 203 | return_tensors="pt", 204 | padding=True, 205 | truncation=True, 206 | max_length=max_length, 207 | ) 208 | tokens = { 209 | key: val.to(device) for key, val in tokens.items() 210 | }  # Move tokens to the selected device 211 | 212 | with torch.no_grad(): 213 | output = model(**tokens) 214 | 215 | scores = output.logits.detach().cpu().numpy() 216 | scores = expit(scores) 217 | predictions = (scores >= threshold) * 1
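# expit (the logistic sigmoid) maps each logit to an independent per-label probability,
# so any number of labels can clear `threshold` at once; that is what makes this multi-label.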
218 | 219 | batch_labels = [] 220 | for prediction in predictions: 221 | labels = [ 222 | class_mapping[i] for i, value in enumerate(prediction) if value == 1 223 | ] 224 | batch_labels.append(labels) 225 | 226 | return batch_labels 227 | 228 | return _classifier, device 229 | 230 | 231 | @lru_cache() 232 | def load_nltk_punkt(): 233 | import nltk 234 | 235 | try: 236 | nltk.data.find("tokenizers/punkt") 237 | except LookupError: 238 | nltk.download("punkt") 239 | return nltk.data.find("tokenizers/punkt") 240 | 241 | 242 | def download_all_models(remove_existing=False): 243 | """Download all models required for OmniParse.""" 244 | if remove_existing: 245 | print("[LOG] Removing existing models...") 246 | home_folder = get_home_folder() 247 | model_folders = [ 248 | os.path.join(home_folder, "models/reuters"), 249 | os.path.join(home_folder, "models"), 250 | ] 251 | for folder in model_folders: 252 | if Path(folder).exists(): 253 | shutil.rmtree(folder) 254 | print("[LOG] Existing models removed.") 255 | 256 | # Load each model to trigger download 257 | # print("[LOG] Downloading BERT Base Uncased...") 258 | # load_bert_base_uncased() 259 | # print("[LOG] Downloading BGE Small EN v1.5...") 260 | # load_bge_small_en_v1_5() 261 | # print("[LOG] Downloading ONNX model...") 262 | # load_onnx_all_MiniLM_l6_v2() 263 | print("[LOG] Downloading text classifier...") 264 | _, device = load_text_multilabel_classifier() 265 | print(f"[LOG] Text classifier loaded on {device}") 266 | print("[LOG] Downloading custom NLTK Punkt model...") 267 | load_nltk_punkt() 268 | print("[LOG] ✅ All models downloaded successfully.") 269 | 270 | 271 | def main(): 272 | parser = argparse.ArgumentParser(description="OmniParse Web Model loader") 273 | parser.add_argument( 274 | "--remove-existing", 275 | action="store_true", 276 | help="Remove existing models before downloading", 277 | ) 278 | args = parser.parse_args() 279 | 280 | download_all_models(remove_existing=args.remove_existing) 281 | 282 | 283 | if __name__ == "__main__": 284 | main() 285 | -------------------------------------------------------------------------------- /omniparse/web/models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Title: OmniParse 3 | Author: Adithya S K 4 | Date: 2024-07-02 5 | 6 | This code includes portions of code from the crawl4ai repository by unclecode, licensed under the Apache 2.0 License. 
7 | Original repository: https://github.com/unclecode/crawl4ai 8 | 9 | Original Author: unclecode 10 | 11 | License: Apache 2.0 License 12 | URL: https://github.com/unclecode/crawl4ai/blob/main/LICENSE 13 | """ 14 | 15 | from pydantic import BaseModel, HttpUrl 16 | from typing import List, Dict, Optional 17 | 18 | 19 | class UrlModel(BaseModel): 20 | url: HttpUrl 21 | forced: bool = False 22 | 23 | 24 | class CrawlResult(BaseModel): 25 | url: str 26 | html: str 27 | success: bool 28 | cleaned_html: Optional[str] = None 29 | media: Dict[str, List[Dict]] = {} 30 | links: Dict[str, List[Dict]] = {} 31 | screenshot: Optional[str] = None 32 | markdown: Optional[str] = None 33 | extracted_content: Optional[str] = None 34 | metadata: Optional[dict] = None 35 | error_message: Optional[str] = None 36 | -------------------------------------------------------------------------------- /omniparse/web/prompts.py: -------------------------------------------------------------------------------- 1 | """ 2 | Title: OmniParse 3 | Author: Adithya S K 4 | Date: 2024-07-02 5 | 6 | This code includes portions of code from the crawl4ai repository by unclecode, licensed under the Apache 2.0 License. 7 | Original repository: https://github.com/unclecode/crawl4ai 8 | 9 | Original Author: unclecode 10 | 11 | License: Apache 2.0 License 12 | URL: https://github.com/unclecode/crawl4ai/blob/main/LICENSE 13 | """ 14 | 15 | PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage: 16 | {URL} 17 | 18 | And here is the cleaned HTML content of that webpage: 19 | 20 | {HTML} 21 | 22 | 23 | Your task is to break down this HTML content into semantically relevant blocks, and for each block, generate a JSON object with the following keys: 24 | 25 | - index: an integer representing the index of the block in the content 26 | - tags: a list of semantic tags that are relevant to the content of the block 27 | - content: a list of strings containing the text content of the block 28 | - questions: a list of 3 questions that a user may ask about the content in this block 29 | 30 | To generate the JSON objects: 31 | 32 | 1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks. 33 | 34 | 2. For each block: 35 | a. Assign it an index based on its order in the content. 36 | b. Analyze the content and generate a list of relevant semantic tags that describe what the block is about. 37 | c. Extract the text content, clean it up if needed, and store it as a list of strings in the "content" field. 38 | d. Come up with 3 questions that a user might ask about this specific block of content, based on the tags and content. The questions should be relevant and answerable by the content in the block. 39 | 40 | 3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content. 41 | 42 | 4. Double-check that each JSON object includes all required keys (index, tags, content, questions) and that the values are in the expected format (integer, list of strings, etc.). 43 | 44 | 5. Make sure the generated JSON is complete and parsable, with no errors or omissions. 45 | 46 | 6. Make sure to escape any special characters in the HTML content, as well as any single or double quotes, to avoid JSON parsing issues.
47 | 48 | Please provide your output within <blocks> tags, like this: 49 | 50 | <blocks> 51 | [{ 52 | "index": 0, 53 | "tags": ["introduction", "overview"], 54 | "content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."], 55 | "questions": [ 56 | "What is the main topic of this article?", 57 | "What can I expect to learn from reading this article?", 58 | "Is this article suitable for beginners or experts in the field?" 59 | ] 60 | }, 61 | { 62 | "index": 1, 63 | "tags": ["history", "background"], 64 | "content": ["This is the second paragraph, which delves into the history and background of the topic.", 65 | "It provides context and sets the stage for the rest of the article."], 66 | "questions": [ 67 | "What historical events led to the development of this topic?", 68 | "How has the understanding of this topic evolved over time?", 69 | "What are some key milestones in the history of this topic?" 70 | ] 71 | }] 72 | </blocks> 73 | 74 | Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order.""" 75 | 76 | PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage: 77 | {URL} 78 | 79 | And here is the cleaned HTML content of that webpage: 80 | 81 | {HTML} 82 | 83 | 84 | Your task is to break down this HTML content into semantically relevant blocks, and for each block, generate a JSON object with the following keys: 85 | 86 | - index: an integer representing the index of the block in the content 87 | - content: a list of strings containing the text content of the block 88 | 89 | To generate the JSON objects: 90 | 91 | 1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks. 92 | 93 | 2. For each block: 94 | a. Assign it an index based on its order in the content. 95 | b. Analyze the content and generate ONE semantic tag that describes what the block is about. 96 | c. Extract the text content, EXACTLY THE SAME AS THE GIVEN DATA, clean it up if needed, and store it as a list of strings in the "content" field. 97 | 98 | 3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content. 99 | 100 | 4. Double-check that each JSON object includes all required keys (index, tags, content) and that the values are in the expected format (integer, list of strings, etc.). 101 | 102 | 5. Make sure the generated JSON is complete and parsable, with no errors or omissions. 103 | 104 | 6. Make sure to escape any special characters in the HTML content, as well as any single or double quotes, to avoid JSON parsing issues. 105 | 106 | 7. Never alter the extracted content, just copy and paste it as it is. 107 | 108 | Please provide your output within <blocks> tags, like this: 109 | 110 | <blocks> 111 | [{ 112 | "index": 0, 113 | "tags": ["introduction"], 114 | "content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."] 115 | }, 116 | { 117 | "index": 1, 118 | "tags": ["background"], 119 | "content": ["This is the second paragraph, which delves into the history and background of the topic.", 120 | "It provides context and sets the stage for the rest of the article."] 121 | }] 122 | </blocks> 123 | 124 | Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors.
The JSON objects should semantically break down the content into relevant blocks, maintaining the original order.""" 125 | 126 | PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION = """Here is the URL of the webpage: 127 | {URL} 128 | 129 | And here is the cleaned HTML content of that webpage: 130 | 131 | {HTML} 132 | 133 | 134 | Your task is to break down this HTML content into semantically relevant blocks, following the provided user's REQUEST, and for each block, generate a JSON object with the following keys: 135 | 136 | - index: an integer representing the index of the block in the content 137 | - content: a list of strings containing the text content of the block 138 | 139 | This is the user's REQUEST, pay attention to it: 140 | 141 | {REQUEST} 142 | 143 | 144 | To generate the JSON objects: 145 | 146 | 1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks. 147 | 148 | 2. For each block: 149 | a. Assign it an index based on its order in the content. 150 | b. Analyze the content and generate ONE semantic tag that describes what the block is about. 151 | c. Extract the text content, EXACTLY THE SAME AS THE GIVEN DATA, clean it up if needed, and store it as a list of strings in the "content" field. 152 | 153 | 3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content. 154 | 155 | 4. Double-check that each JSON object includes all required keys (index, tags, content) and that the values are in the expected format (integer, list of strings, etc.). 156 | 157 | 5. Make sure the generated JSON is complete and parsable, with no errors or omissions. 158 | 159 | 6. Make sure to escape any special characters in the HTML content, as well as any single or double quotes, to avoid JSON parsing issues. 160 | 161 | 7. Never alter the extracted content, just copy and paste it as it is. 162 | 163 | Please provide your output within <blocks> tags, like this: 164 | 165 | <blocks> 166 | [{ 167 | "index": 0, 168 | "tags": ["introduction"], 169 | "content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."] 170 | }, 171 | { 172 | "index": 1, 173 | "tags": ["background"], 174 | "content": ["This is the second paragraph, which delves into the history and background of the topic.", 175 | "It provides context and sets the stage for the rest of the article."] 176 | }] 177 | </blocks> 178 | 179 | **Make sure to follow the user's REQUEST so that the extracted blocks align with the instruction.** 180 | 181 | Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors.
The JSON objects should semantically break down the content into relevant blocks, maintaining the original order.""" 182 | -------------------------------------------------------------------------------- /omniparse/web/router.py: -------------------------------------------------------------------------------- 1 | from fastapi import HTTPException, APIRouter 2 | from fastapi.responses import JSONResponse 3 | from omniparse import get_shared_state 4 | from omniparse.web import parse_url 5 | from omniparse.models import responseDocument 6 | # from omniparse.models import Document 7 | 8 | model_state = get_shared_state() 9 | website_router = APIRouter() 10 | 11 | 12 | # Website parsing endpoint 13 | @website_router.post("/parse") 14 | async def parse_website(url: str): 15 | try: 16 | parse_web_result: responseDocument = await parse_url(url, model_state) 17 | 18 | return JSONResponse(content=parse_web_result.model_dump()) 19 | 20 | except Exception as e: 21 | raise HTTPException(status_code=500, detail=str(e)) 22 | 23 | 24 | @website_router.post("/crawl") 25 | async def crawl_website(url: str): 26 | return {"message": "Coming soon"} 27 | 28 | 29 | @website_router.post("/search") 30 | async def search_web(url: str, prompt: str): 31 | return {"message": "Coming soon"} 32 | -------------------------------------------------------------------------------- /omniparse/web/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Title: OmniParse 3 | Author: Adithya S K 4 | Date: 2024-07-02 5 | 6 | This code includes portions of code from the crawl4ai repository by unclecode, licensed under the Apache 2.0 License. 7 | Original repository: https://github.com/unclecode/crawl4ai 8 | 9 | Original Author: unclecode 10 | 11 | License: Apache 2.0 License 12 | URL: https://github.com/unclecode/crawl4ai/blob/main/LICENSE 13 | """ 14 | 15 | import time 16 | from concurrent.futures import ThreadPoolExecutor, as_completed 17 | from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString 18 | import html2text 19 | import json 20 | import html 21 | import re 22 | import os 23 | from html2text import HTML2Text 24 | from .prompts import PROMPT_EXTRACT_BLOCKS 25 | from .config import * 26 | from pathlib import Path 27 | 28 | 29 | class InvalidCSSSelectorError(Exception): 30 | pass 31 | 32 | 33 | def get_home_folder(): 34 | home_folder = os.path.join(Path.home(), ".omniparse") 35 | os.makedirs(home_folder, exist_ok=True) 36 | os.makedirs(f"{home_folder}/cache", exist_ok=True) 37 | os.makedirs(f"{home_folder}/models", exist_ok=True) 38 | return home_folder 39 | 40 | 41 | def beautify_html(escaped_html): 42 | """ 43 | Beautifies an escaped HTML string. 44 | 45 | Parameters: 46 | escaped_html (str): A string containing escaped HTML. 47 | 48 | Returns: 49 | str: A beautifully formatted HTML string. 50 | """ 51 | # Unescape the HTML string 52 | unescaped_html = html.unescape(escaped_html) 53 | 54 | # Use BeautifulSoup to parse and prettify the HTML 55 | soup = BeautifulSoup(unescaped_html, "html.parser") 56 | pretty_html = soup.prettify() 57 | 58 | return pretty_html 59 | 60 | 61 | def split_and_parse_json_objects(json_string): 62 | """ 63 | Splits a JSON string which is a list of objects and tries to parse each object. 64 | 65 | Parameters: 66 | json_string (str): A string representation of a list of JSON objects, e.g., '[{...}, {...}, ...]'. 67 | 68 | Returns: 69 | tuple: A tuple containing two lists: 70 | - First list contains all successfully parsed JSON objects.
71 | - Second list contains the string representations of all segments that couldn't be parsed. 72 | """ 73 | # Trim the leading '[' and trailing ']' 74 | if json_string.startswith("[") and json_string.endswith("]"): 75 | json_string = json_string[1:-1].strip() 76 | 77 | # Split the string into segments that look like individual JSON objects 78 | segments = [] 79 | depth = 0 80 | start_index = 0 81 | 82 | for i, char in enumerate(json_string): 83 | if char == "{": 84 | if depth == 0: 85 | start_index = i 86 | depth += 1 87 | elif char == "}": 88 | depth -= 1 89 | if depth == 0: 90 | segments.append(json_string[start_index : i + 1]) 91 | 92 | # Try parsing each segment 93 | parsed_objects = [] 94 | unparsed_segments = [] 95 | 96 | for segment in segments: 97 | try: 98 | obj = json.loads(segment) 99 | parsed_objects.append(obj) 100 | except json.JSONDecodeError: 101 | unparsed_segments.append(segment) 102 | 103 | return parsed_objects, unparsed_segments 104 | 105 | 106 | def sanitize_html(html): 107 | # Replace all weird and special characters with an empty string 108 | sanitized_html = html 109 | # sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html) 110 | 111 | # Escape all double and single quotes 112 | sanitized_html = sanitized_html.replace('"', '\\"').replace("'", "\\'") 113 | 114 | return sanitized_html 115 | 116 | 117 | def escape_json_string(s): 118 | """ 119 | Escapes characters in a string to be JSON safe. 120 | 121 | Parameters: 122 | s (str): The input string to be escaped. 123 | 124 | Returns: 125 | str: The escaped string, safe for JSON encoding. 126 | """ 127 | # Replace problematic backslash first 128 | s = s.replace("\\", "\\\\") 129 | 130 | # Replace the double quote 131 | s = s.replace('"', '\\"') 132 | 133 | # Escape control characters 134 | s = s.replace("\b", "\\b") 135 | s = s.replace("\f", "\\f") 136 | s = s.replace("\n", "\\n") 137 | s = s.replace("\r", "\\r") 138 | s = s.replace("\t", "\\t") 139 | 140 | # Additional problematic characters 141 | # Unicode control characters 142 | s = re.sub(r"[\x00-\x1f\x7f-\x9f]", lambda x: "\\u{:04x}".format(ord(x.group())), s) 143 | 144 | return s 145 | 146 | 147 | class CustomHTML2Text(HTML2Text): 148 | def __init__(self, *args, **kwargs): 149 | super().__init__(*args, **kwargs) 150 | self.ignore_links = True 151 | self.inside_pre = False 152 | self.inside_code = False 153 | 154 | def handle_tag(self, tag, attrs, start): 155 | if tag == "pre": 156 | if start: 157 | self.o("```\n") 158 | self.inside_pre = True 159 | else: 160 | self.o("\n```") 161 | self.inside_pre = False 162 | # elif tag == 'code' and not self.inside_pre: 163 | # if start: 164 | # if not self.inside_pre: 165 | # self.o('`') 166 | # self.inside_code = True 167 | # else: 168 | # if not self.inside_pre: 169 | # self.o('`') 170 | # self.inside_code = False 171 | 172 | super().handle_tag(tag, attrs, start) 173 | 174 | 175 | def get_content_of_website( 176 | url, html, word_count_threshold=MIN_WORD_THRESHOLD, css_selector=None 177 | ): 178 | try: 179 | if not html: 180 | return None 181 | # Parse HTML content with BeautifulSoup 182 | soup = BeautifulSoup(html, "html.parser") 183 | 184 | # Get the content within the <body> tag 185 | body = soup.body 186 | 187 | # If css_selector is provided, extract content based on the selector 188 | if css_selector: 189 | selected_elements = body.select(css_selector) 190 | if not selected_elements: 191 | raise InvalidCSSSelectorError( 192 | f"Invalid CSS selector: no elements found for {css_selector}"
) 194 | div_tag = soup.new_tag("div") 195 | for el in selected_elements: 196 | div_tag.append(el) 197 | body = div_tag 198 | 199 | links = {"internal": [], "external": []} 200 | 201 | # Extract all internal and external links 202 | for a in body.find_all("a", href=True): 203 | href = a["href"] 204 | url_base = url.split("/")[2] 205 | if href.startswith("http") and url_base not in href: 206 | links["external"].append({"href": href, "text": a.get_text()}) 207 | else: 208 | links["internal"].append({"href": href, "text": a.get_text()}) 209 | 210 | # Remove script, style, and other tags that don't carry useful content from body 211 | for tag in body.find_all(["script", "style", "link", "meta", "noscript"]): 212 | tag.decompose() 213 | 214 | # Remove all attributes from remaining tags in body, except for img tags 215 | for tag in body.find_all(): 216 | if tag.name != "img": 217 | tag.attrs = {} 218 | 219 | # Extract all img tags into [{src: '', alt: ''}] 220 | media = {"images": [], "videos": [], "audios": []} 221 | for img in body.find_all("img"): 222 | media["images"].append( 223 | {"src": img.get("src"), "alt": img.get("alt"), "type": "image"} 224 | ) 225 | 226 | # Extract all video tags into [{src: '', alt: ''}] 227 | for video in body.find_all("video"): 228 | media["videos"].append( 229 | {"src": video.get("src"), "alt": video.get("alt"), "type": "video"} 230 | ) 231 | 232 | # Extract all audio tags into [{src: '', alt: ''}] 233 | for audio in body.find_all("audio"): 234 | media["audios"].append( 235 | {"src": audio.get("src"), "alt": audio.get("alt"), "type": "audio"} 236 | ) 237 | 238 | # Replace images with their alt text or remove them if no alt text is available 239 | for img in body.find_all("img"): 240 | alt_text = img.get("alt") 241 | if alt_text: 242 | img.replace_with(soup.new_string(alt_text)) 243 | else: 244 | img.decompose() 245 | 246 | # Replace the content of all "pre" tags with their inner text 247 | def replace_pre_tags_with_text(node): 248 | for child in node.find_all("pre"): 249 | # set child inner html to its text 250 | child.string = child.get_text() 251 | return node 252 | 253 | # Replace all "pre" tags with their inner text 254 | body = replace_pre_tags_with_text(body) 255 | 256 | # Recursively remove empty elements, their parent elements, and elements with word count below threshold 257 | def remove_empty_and_low_word_count_elements(node, word_count_threshold): 258 | for child in node.contents: 259 | if isinstance(child, element.Tag): 260 | remove_empty_and_low_word_count_elements( 261 | child, word_count_threshold 262 | ) 263 | word_count = len(child.get_text(strip=True).split()) 264 | if ( 265 | len(child.contents) == 0 and not child.get_text(strip=True) 266 | ) or word_count < word_count_threshold: 267 | child.decompose() 268 | return node 269 | 270 | body = remove_empty_and_low_word_count_elements(body, word_count_threshold) 271 | 272 | def remove_small_text_tags( 273 | body: Tag, word_count_threshold: int = MIN_WORD_THRESHOLD 274 | ): 275 | # We'll use a list to collect all tags that don't meet the word count requirement 276 | tags_to_remove = [] 277 | 278 | # Traverse all tags in the body 279 | for tag in body.find_all(True):  # True here means all tags 280 | # Check if the tag contains text and if it's not just whitespace 281 | if tag.string and tag.string.strip(): 282 | # Split the text by spaces and count the words 283 | word_count = len(tag.string.strip().split()) 284 | # If the word count is less than the threshold, mark the tag for removal
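# (note: tag.string is only non-None when the tag boils down to a single string child, so tags with mixed nested markup are intentionally skipped by this check)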
285 | if word_count < word_count_threshold: 286 | tags_to_remove.append(tag) 287 | 288 | # Remove all marked tags from the tree 289 | for tag in tags_to_remove: 290 | tag.decompose()  # or tag.extract() to remove and get the element 291 | 292 | return body 293 | 294 | # Remove small text tags 295 | body = remove_small_text_tags(body, word_count_threshold) 296 | 297 | def is_empty_or_whitespace(tag: Tag): 298 | if isinstance(tag, NavigableString): 299 | return not tag.strip() 300 | # Check if the tag itself is empty or all its children are empty/whitespace 301 | if not tag.contents: 302 | return True 303 | return all(is_empty_or_whitespace(child) for child in tag.contents) 304 | 305 | def remove_empty_tags(body: Tag): 306 | # Continue processing until no more changes are made 307 | changes = True 308 | while changes: 309 | changes = False 310 | # Collect all tags that are empty or contain only whitespace 311 | empty_tags = [ 312 | tag for tag in body.find_all(True) if is_empty_or_whitespace(tag) 313 | ] 314 | for tag in empty_tags: 315 | # If a tag is empty, decompose it 316 | tag.decompose() 317 | changes = True  # Mark that a change was made 318 | 319 | return body 320 | 321 | # Remove empty tags 322 | body = remove_empty_tags(body) 323 | 324 | # Flatten nested elements with only one child of the same type 325 | def flatten_nested_elements(node): 326 | for child in node.contents: 327 | if isinstance(child, element.Tag): 328 | flatten_nested_elements(child) 329 | if ( 330 | len(child.contents) == 1 331 | and child.contents[0].name == child.name 332 | ): 333 | # print('Flattening:', child.name) 334 | child_content = child.contents[0] 335 | child.replace_with(child_content) 336 | 337 | return node 338 | 339 | body = flatten_nested_elements(body) 340 | 341 | # Remove comments 342 | for comment in soup.find_all(string=lambda text: isinstance(text, Comment)): 343 | comment.extract() 344 | 345 | # Remove consecutive empty newlines and replace multiple spaces with a single space 346 | cleaned_html = str(body).replace("\n\n", "\n").replace("  ", " ") 347 | 348 | # Sanitize the cleaned HTML content 349 | cleaned_html = sanitize_html(cleaned_html) 350 | # sanitized_html = escape_json_string(cleaned_html) 351 | 352 | # Convert cleaned HTML to Markdown 353 | 354 | h = CustomHTML2Text() 355 | h.ignore_links = True 356 | markdown = h.handle(cleaned_html) 357 | markdown = markdown.replace(" ```", "```") 358 | 359 | # Return the Markdown content 360 | return { 361 | "markdown": markdown, 362 | "cleaned_html": cleaned_html, 363 | "success": True, 364 | "media": media, 365 | "links": links, 366 | } 367 | 368 | except Exception as e: 369 | print("Error processing HTML content:", str(e)) 370 | raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e 371 | 372 | 373 | def extract_metadata(html): 374 | metadata = {} 375 | 376 | if not html: 377 | return metadata 378 | 379 | # Parse HTML content with BeautifulSoup 380 | soup = BeautifulSoup(html, "html.parser") 381 | 382 | # Title 383 | title_tag = soup.find("title") 384 | metadata["title"] = title_tag.string if title_tag else None 385 | 386 | # Meta description 387 | description_tag = soup.find("meta", attrs={"name": "description"}) 388 | metadata["description"] = description_tag["content"] if description_tag else None 389 | 390 | # Meta keywords 391 | keywords_tag = soup.find("meta", attrs={"name": "keywords"}) 392 | metadata["keywords"] = keywords_tag["content"] if keywords_tag else None 393 | 394 | # Meta author
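# e.g. <meta name="author" content="Jane Doe">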
395 | author_tag = soup.find("meta", attrs={"name": "author"}) 396 | metadata["author"] = author_tag["content"] if author_tag else None 397 | 398 | # Open Graph metadata 399 | og_tags = soup.find_all( 400 | "meta", attrs={"property": lambda value: value and value.startswith("og:")} 401 | ) 402 | for tag in og_tags: 403 | property_name = tag["property"] 404 | metadata[property_name] = tag["content"] 405 | 406 | # Twitter Card metadata 407 | twitter_tags = soup.find_all( 408 | "meta", attrs={"name": lambda value: value and value.startswith("twitter:")} 409 | ) 410 | for tag in twitter_tags: 411 | property_name = tag["name"] 412 | metadata[property_name] = tag["content"] 413 | 414 | return metadata 415 | 416 | 417 | def extract_xml_tags(string): 418 | tags = re.findall(r"<(\w+)>", string) 419 | return list(set(tags)) 420 | 421 | 422 | def extract_xml_data(tags, string): 423 | data = {} 424 | 425 | for tag in tags: 426 | pattern = f"<{tag}>(.*?)</{tag}>" 427 | match = re.search(pattern, string, re.DOTALL) 428 | if match: 429 | data[tag] = match.group(1).strip() 430 | else: 431 | data[tag] = "" 432 | 433 | return data 434 | 435 | 436 | # Function to perform the completion with exponential backoff 437 | def perform_completion_with_backoff(provider, prompt_with_variables, api_token): 438 | from litellm import completion 439 | from litellm.exceptions import RateLimitError 440 | 441 | max_attempts = 3 442 | base_delay = 2  # Base delay in seconds, you can adjust this based on your needs 443 | 444 | for attempt in range(max_attempts): 445 | try: 446 | response = completion( 447 | model=provider, 448 | messages=[{"role": "user", "content": prompt_with_variables}], 449 | temperature=0.01, 450 | api_key=api_token, 451 | ) 452 | return response  # Return the successful response 453 | except RateLimitError as e: 454 | print("Rate limit error:", str(e)) 455 | 456 | # Check if we have exhausted our max attempts 457 | if attempt < max_attempts - 1: 458 | # Calculate the delay and wait 459 | delay = base_delay * (2**attempt)  # Exponential backoff formula 460 | print(f"Waiting for {delay} seconds before retrying...") 461 | time.sleep(delay) 462 | else: 463 | # Return an error response after exhausting all retries 464 | return [ 465 | { 466 | "index": 0, 467 | "tags": ["error"], 468 | "content": ["Rate limit error.
Please try again later."], 469 | } 470 | ] 471 | 472 | 473 | def extract_blocks(url, html, provider=DEFAULT_PROVIDER, api_token=None): 474 | # api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token 475 | api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token 476 | 477 | variable_values = { 478 | "URL": url, 479 | "HTML": escape_json_string(sanitize_html(html)), 480 | } 481 | 482 | prompt_with_variables = PROMPT_EXTRACT_BLOCKS 483 | for variable in variable_values: 484 | prompt_with_variables = prompt_with_variables.replace( 485 | "{" + variable + "}", variable_values[variable] 486 | ) 487 | 488 | response = perform_completion_with_backoff( 489 | provider, prompt_with_variables, api_token 490 | ) 491 | 492 | try: 493 | blocks = extract_xml_data(["blocks"], response.choices[0].message.content)[ 494 | "blocks" 495 | ] 496 | blocks = json.loads(blocks) 497 | ## Add error: False to the blocks 498 | for block in blocks: 499 | block["error"] = False 500 | except Exception as e: 501 | print("Error extracting blocks:", str(e)) 502 | parsed, unparsed = split_and_parse_json_objects( 503 | response.choices[0].message.content 504 | ) 505 | blocks = parsed 506 | # Append all unparsed segments as one error block whose content is the list of unparsed segments 507 | if unparsed: 508 | blocks.append( 509 | {"index": 0, "error": True, "tags": ["error"], "content": unparsed} 510 | ) 511 | return blocks 512 | 513 | 514 | def extract_blocks_batch(batch_data, provider="groq/llama3-70b-8192", api_token=None): 515 | api_token = os.getenv("GROQ_API_KEY", None) if not api_token else api_token 516 | from litellm import batch_completion 517 | 518 | messages = [] 519 | 520 | for url, html in batch_data: 521 | variable_values = { 522 | "URL": url, 523 | "HTML": html, 524 | } 525 | 526 | prompt_with_variables = PROMPT_EXTRACT_BLOCKS 527 | for variable in variable_values: 528 | prompt_with_variables = prompt_with_variables.replace( 529 | "{" + variable + "}", variable_values[variable] 530 | ) 531 | 532 | messages.append([{"role": "user", "content": prompt_with_variables}]) 533 | 534 | responses = batch_completion(model=provider, messages=messages, temperature=0.01) 535 | 536 | all_blocks = [] 537 | for response in responses: 538 | try: 539 | blocks = extract_xml_data(["blocks"], response.choices[0].message.content)[ 540 | "blocks" 541 | ] 542 | blocks = json.loads(blocks) 543 | 544 | except Exception as e: 545 | print("Error extracting blocks:", str(e)) 546 | blocks = [ 547 | { 548 | "index": 0, 549 | "tags": ["error"], 550 | "content": [ 551 | "Error extracting blocks from the HTML content. Choose another provider/model or try again." 552 | ], 553 | "questions": [ 554 | "What went wrong during the block extraction process?" 555 | ], 556 | } 557 | ] 558 | all_blocks.append(blocks) 559 | 560 | return sum(all_blocks, []) 561 | 562 | 563 | def merge_chunks_based_on_token_threshold(chunks, token_threshold): 564 | """ 565 | Merges small chunks into larger ones based on the total token threshold. 566 | 567 | :param chunks: List of text chunks to be merged based on token count. 568 | :param token_threshold: Max number of tokens for each merged chunk. 569 | :return: List of merged text chunks.
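Example (illustrative; tokens are estimated as word count * 1.3, so "a b c" counts as ~3.9 tokens):

>>> merge_chunks_based_on_token_threshold(["a b c", "d e f", "g h i"], 8)
['a b c\n\nd e f', 'g h i']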
570 | """ 571 | merged_sections = [] 572 | current_chunk = [] 573 | total_token_so_far = 0 574 | 575 | for chunk in chunks: 576 | chunk_token_count = ( 577 | len(chunk.split()) * 1.3 578 | ) # Estimate token count with a factor 579 | if total_token_so_far + chunk_token_count < token_threshold: 580 | current_chunk.append(chunk) 581 | total_token_so_far += chunk_token_count 582 | else: 583 | if current_chunk: 584 | merged_sections.append("\n\n".join(current_chunk)) 585 | current_chunk = [chunk] 586 | total_token_so_far = chunk_token_count 587 | 588 | # Add the last chunk if it exists 589 | if current_chunk: 590 | merged_sections.append("\n\n".join(current_chunk)) 591 | 592 | return merged_sections 593 | 594 | 595 | def process_sections(url: str, sections: list, provider: str, api_token: str) -> list: 596 | extracted_content = [] 597 | if provider.startswith("groq/"): 598 | # Sequential processing with a delay 599 | for section in sections: 600 | extracted_content.extend(extract_blocks(url, section, provider, api_token)) 601 | time.sleep(0.5) # 500 ms delay between each processing 602 | else: 603 | # Parallel processing using ThreadPoolExecutor 604 | with ThreadPoolExecutor() as executor: 605 | futures = [ 606 | executor.submit(extract_blocks, url, section, provider, api_token) 607 | for section in sections 608 | ] 609 | for future in as_completed(futures): 610 | extracted_content.extend(future.result()) 611 | 612 | return extracted_content 613 | 614 | 615 | def wrap_text(draw, text, font, max_width): 616 | # Wrap the text to fit within the specified width 617 | lines = [] 618 | words = text.split() 619 | while words: 620 | line = "" 621 | while ( 622 | words and draw.textbbox((0, 0), line + words[0], font=font)[2] <= max_width 623 | ): 624 | line += words.pop(0) + " " 625 | lines.append(line) 626 | return "\n".join(lines) 627 | 628 | 629 | from fastapi import FastAPI, UploadFile, File, HTTPException, APIRouter, status, Form 630 | import importlib 631 | 632 | 633 | def import_strategy(module_name: str, class_name: str, *args, **kwargs): 634 | try: 635 | module = importlib.import_module(module_name) 636 | strategy_class = getattr(module, class_name) 637 | return strategy_class(*args, **kwargs) 638 | except ImportError: 639 | print("ImportError: Module not found.") 640 | raise HTTPException(status_code=400, detail=f"Module {module_name} not found.") 641 | except AttributeError: 642 | print("AttributeError: Class not found.") 643 | raise HTTPException( 644 | status_code=400, detail=f"Class {class_name} not found in {module_name}." 645 | ) 646 | -------------------------------------------------------------------------------- /omniparse/web/web_crawler.py: -------------------------------------------------------------------------------- 1 | """ 2 | Title: OmniParse 3 | Author: Adithya S K 4 | Date: 2024-07-02 5 | 6 | This code includes portions of code from the crawl4ai repository by unclecode, licensed under the Apache 2.0 License. 
7 | Original repository: https://github.com/unclecode/crawl4ai 8 | 9 | Original Author: unclecode 10 | 11 | License: Apache 2.0 License 12 | URL: https://github.com/unclecode/crawl4ai/blob/main/LICENSE 13 | """ 14 | 15 | import os 16 | import time 17 | 18 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 19 | from omniparse.web.models import UrlModel 20 | from omniparse.web.utils import ( 21 | get_content_of_website, 22 | extract_metadata, 23 | InvalidCSSSelectorError, 24 | ) 25 | from omniparse.web.crawler_strategy import CrawlerStrategy, LocalSeleniumCrawlerStrategy 26 | from typing import List 27 | from concurrent.futures import ThreadPoolExecutor 28 | from omniparse.web.config import DEFAULT_PROVIDER, MIN_WORD_THRESHOLD 29 | from omniparse.models import responseDocument 30 | 31 | 32 | class WebCrawler: 33 | def __init__( 34 | self, 35 | crawler_strategy: CrawlerStrategy = None, 36 | always_by_pass_cache: bool = True, 37 | verbose: bool = False, 38 | ): 39 | self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy( 40 | verbose=verbose 41 | ) 42 | self.always_by_pass_cache = always_by_pass_cache 43 | self.ready = False 44 | 45 | def warmup(self): 46 | print("[LOG] Warming up the WebCrawler") 47 | result = self.run( 48 | url="https://adithyask.com", 49 | word_count_threshold=5, 50 | bypass_cache=True, 51 | verbose=False, 52 | ) 53 | print(result) 54 | self.ready = True 55 | print("[LOG] WebCrawler is ready to crawl") 56 | 57 | def fetch_page( 58 | self, 59 | url_model: UrlModel, 60 | provider: str = DEFAULT_PROVIDER, 61 | api_token: str = None, 62 | extract_blocks_flag: bool = True, 63 | word_count_threshold=MIN_WORD_THRESHOLD, 64 | css_selector: str = None, 65 | screenshot: bool = False, 66 | use_cached_html: bool = False, 67 | **kwargs, 68 | ) -> responseDocument: 69 | return self.run( 70 | url_model.url, 71 | word_count_threshold, 72 | bypass_cache=url_model.forced, 73 | css_selector=css_selector, 74 | screenshot=screenshot, 75 | **kwargs, 76 | ) 77 | 78 | 79 | def fetch_pages( 80 | self, 81 | url_models: List[UrlModel], 82 | provider: str = DEFAULT_PROVIDER, 83 | api_token: str = None, 84 | extract_blocks_flag: bool = True, 85 | word_count_threshold=MIN_WORD_THRESHOLD, 86 | use_cached_html: bool = False, 87 | css_selector: str = None, 88 | screenshot: bool = False, 89 | **kwargs, 90 | ) -> List[responseDocument]: 91 | def fetch_page_wrapper(url_model, *args, **kwargs): 92 | return self.fetch_page(url_model, *args, **kwargs) 93 | 94 | with ThreadPoolExecutor() as executor: 95 | results = list( 96 | executor.map( 97 | fetch_page_wrapper, 98 | url_models, 99 | [provider] * len(url_models), 100 | [api_token] * len(url_models), 101 | [extract_blocks_flag] * len(url_models), 102 | [word_count_threshold] * len(url_models), 103 | [css_selector] * len(url_models), 104 | [screenshot] * len(url_models), 105 | [use_cached_html] * len(url_models), 106 | *[kwargs] * len(url_models), 107 | ) 108 | ) 109 | 110 | return results 111 | 112 | def run( 113 | self, 114 | url: str, 115 | word_count_threshold=MIN_WORD_THRESHOLD, 116 | bypass_cache: bool = False, 117 | css_selector: str = None, 118 | screenshot: bool = False, 119 | user_agent: str = None, 120 | verbose=True, 121 | **kwargs, 122 | ) -> responseDocument: 123 | extracted_content = None 124 | cached = None 125 | if word_count_threshold < MIN_WORD_THRESHOLD: 126 | word_count_threshold = MIN_WORD_THRESHOLD 127 | 128 | # Clamping the word-count threshold must not skip the crawl itself 129 | if user_agent: 130 | self.crawler_strategy.update_user_agent(user_agent) 131 | html =
self.crawler_strategy.crawl(url) 132 | if screenshot: 133 | screenshot = self.crawler_strategy.take_screenshot() 134 | 135 | processed_html = self.process_html( 136 | url, 137 | html, 138 | extracted_content, 139 | word_count_threshold, 140 | css_selector, 141 | screenshot, 142 | verbose, 143 | bool(cached), 144 | **kwargs, 145 | ) 146 | 147 | crawl_result = responseDocument( 148 | text=processed_html["markdown"], metadata=processed_html 149 | ) 150 | crawl_result.add_image("screenshot", image_data=processed_html["screenshot"]) 151 | return crawl_result 152 | 153 | def process_html( 154 | self, 155 | url: str, 156 | html: str, 157 | extracted_content: str, 158 | word_count_threshold: int, 159 | css_selector: str, 160 | screenshot: bool, 161 | verbose: bool, 162 | is_cached: bool, 163 | **kwargs, 164 | ): 165 | t = time.time() 166 | # Extract content from HTML 167 | try: 168 | result = get_content_of_website( 169 | url, html, word_count_threshold, css_selector=css_selector 170 | ) 171 | metadata = extract_metadata(html) 172 | if result is None: 173 | raise ValueError(f"Failed to extract content from the website: {url}") 174 | except InvalidCSSSelectorError as e: 175 | raise ValueError(str(e)) 176 | 177 | cleaned_html = result.get("cleaned_html", "") 178 | markdown = result.get("markdown", "") 179 | media = result.get("media", []) 180 | links = result.get("links", []) 181 | 182 | if verbose: 183 | print( 184 | f"[LOG] Crawling done for {url}, success: True, time taken: {time.time() - t} seconds" 185 | ) 186 | 187 | screenshot = None if not screenshot else screenshot 188 | 189 | return { 190 | "url": url, 191 | "html": html, 192 | "cleaned_html": cleaned_html, 193 | "markdown": markdown, 194 | "media": media, 195 | "links": links, 196 | "metadata": metadata, 197 | "screenshot": screenshot, 198 | "extracted_content": extracted_content, 199 | "success": True, 200 | "error_message": "", 201 | } 202 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "omniparse" 3 | version = "0.0.1" 4 | description = "API to convert Documents | Media | Webpage into Structured Markdown for LLM usecases" 5 | authors = ["Adithya S K "] 6 | license = "Apache" 7 | readme = "README.md" 8 | keywords = ["pdf", "markdown", "ocr", "parse"] 9 | include = [ 10 | "server.py", 11 | ] 12 | 13 | [tool.poetry.dependencies] 14 | python = "^3.10" 15 | scikit-learn = "^1.3.2" 16 | Pillow = "^10.1.0" 17 | pydantic = "^2.4.2" 18 | pydantic-settings = "^2.0.3" 19 | transformers = "^4.41.2" 20 | numpy = "^1.26.1" 21 | python-dotenv = "^1.0.0" 22 | torch = "^2.2.2" # Issue with torch 2.3.0 and vision models - https://github.com/pytorch/pytorch/issues/121834 23 | tqdm = "^4.66.1" 24 | tabulate = "^0.9.0" 25 | ftfy = "^6.1.1" 26 | texify = "^0.1.8" 27 | rapidfuzz = "^3.8.1" 28 | surya-ocr = "^0.4.3" 29 | filetype = "^1.2.0" 30 | regex = "^2024.4.28" 31 | pdftext = "^0.3.10" 32 | grpcio = "^1.63.0" 33 | fastapi = "^0.111.0" 34 | uvicorn = "^0.29.0" 35 | pypdfium2 = "^4.30.0" 36 | moviepy = "^1.0.3" 37 | openai-whisper = "^20231117" 38 | pytube = "^15.0.0" 39 | beautifulsoup4 = "^4.12.3" 40 | html2text = "^2024.2.26" 41 | selenium = "^4.21.0" 42 | webdriver-manager = "^4.0.1" 43 | img2pdf = "^0.5.1" 44 | matplotlib = "^3.9.0" 45 | timm = "^1.0.7" 46 | flash-attn = "^2.5.9" 47 | art = "^6.2" 48 | gradio = "^4.37.1" 49 | nltk = "^3.8.1" 50 | marker-pdf = "^0.2.16" 51 | 52 | 
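# Installing the package exposes an `omniparse` console command that calls main() in server.py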
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[tool.poetry]
name = "omniparse"
version = "0.0.1"
description = "API to convert Documents | Media | Webpage into Structured Markdown for LLM usecases"
authors = ["Adithya S K "]
license = "Apache"
readme = "README.md"
keywords = ["pdf", "markdown", "ocr", "parse"]
include = [
    "server.py",
]

[tool.poetry.dependencies]
python = "^3.10"
scikit-learn = "^1.3.2"
Pillow = "^10.1.0"
pydantic = "^2.4.2"
pydantic-settings = "^2.0.3"
transformers = "^4.41.2"
numpy = "^1.26.1"
python-dotenv = "^1.0.0"
torch = "^2.2.2" # Issue with torch 2.3.0 and vision models - https://github.com/pytorch/pytorch/issues/121834
tqdm = "^4.66.1"
tabulate = "^0.9.0"
ftfy = "^6.1.1"
texify = "^0.1.8"
rapidfuzz = "^3.8.1"
surya-ocr = "^0.4.3"
filetype = "^1.2.0"
regex = "^2024.4.28"
pdftext = "^0.3.10"
grpcio = "^1.63.0"
fastapi = "^0.111.0"
uvicorn = "^0.29.0"
pypdfium2 = "^4.30.0"
moviepy = "^1.0.3"
openai-whisper = "^20231117"
pytube = "^15.0.0"
beautifulsoup4 = "^4.12.3"
html2text = "^2024.2.26"
selenium = "^4.21.0"
webdriver-manager = "^4.0.1"
img2pdf = "^0.5.1"
matplotlib = "^3.9.0"
timm = "^1.0.7"
flash-attn = "^2.5.9"
art = "^6.2"
gradio = "^4.37.1"
nltk = "^3.8.1"
marker-pdf = "^0.2.16"

[tool.poetry.scripts]
omniparse = "server:main"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
--------------------------------------------------------------------------------
/python-sdk/omniparse_client/__init__.py:
--------------------------------------------------------------------------------
from .omniparse import OmniParse
--------------------------------------------------------------------------------
/python-sdk/omniparse_client/omniparse.py:
--------------------------------------------------------------------------------
import os
import httpx
import requests
import aiofiles
from typing import Optional
from .utils import save_images_and_markdown, ParsedDocument


class OmniParse:
    """Minimal synchronous client that posts PDF files to an OmniParse server."""

    def __init__(self, api_key=None, base_url="http://localhost:8000"):
        self.api_key = api_key
        self.base_url = base_url

    def load_data(self, file_path):
        return self.convert_pdf_to_markdown_and_save([file_path])

    def convert_pdf_to_markdown_and_save(self, pdf_file_paths):
        files = []

        # Prepare the files for the request
        for pdf_file_path in pdf_file_paths:
            with open(pdf_file_path, "rb") as f:
                pdf_content = f.read()
            files.append(
                (
                    "pdf_files",
                    (os.path.basename(pdf_file_path), pdf_content, "application/pdf"),
                )
            )

        # Send a single request to the FastAPI server with all PDF files attached
        response = requests.post(self.base_url, files=files)

        # Check if the request was successful
        if response.status_code == 200:
            # Save markdown and images
            response_data = response.json()
            output_folder = os.path.splitext(os.path.basename(pdf_file_paths[0]))[0]
            save_images_and_markdown(response_data, output_folder)
            print("Markdown and images saved successfully.")
        else:
            print(f"Error: {response.text}")


class AsyncOmniParse:
    """
    An asynchronous client for interacting with the OmniParse server.

    OmniParse is a platform that ingests and parses unstructured data into structured,
    actionable data optimized for GenAI (LLM) applications. This client provides methods
    to interact with the OmniParse server, allowing users to parse various types of
    unstructured data including documents, images, videos, audio files, and web pages.

    The client supports parsing of multiple file types and provides structured output
    in markdown format, making it ideal for AI applications such as RAG (Retrieval-Augmented Generation)
    and fine-tuning.

    Attributes:
        api_key (str): API key for authentication with the OmniParse server.
        base_url (str): Base URL for the OmniParse API endpoints.
        timeout (int): Timeout for API requests in seconds.

    Usage Examples:
    ```python
    # Initialize the client
    parser = AsyncOmniParse(api_key="your_api_key", base_url="http://localhost:8000")

    # Parse a PDF document
    async def parse_pdf_example():
        result = await parser.parse_pdf("/path/to/document.pdf", output_folder="/path/to/output")
        print(result.markdown)  # Access the parsed content

    # Process an image
    async def process_image_example():
        result = await parser.process_image("/path/to/image.jpg", task="Caption", prompt="Describe this image")
        print(result)  # Print the image processing result

    # Parse a website
    async def parse_website_example():
        result = await parser.parse_website("https://example.com")
        print(result)  # Print the parsed website content

    # Parse a video file
    async def parse_video_example():
        result = await parser.parse_video("/path/to/video.mp4")
        print(result)  # Print the parsed video content

    # Use in an async context
    async def main():
        await parse_pdf_example()
        await process_image_example()
        await parse_website_example()
        await parse_video_example()

    # Run the async main function
    import asyncio
    asyncio.run(main())
    ```
    """

    def __init__(self, api_key=None, base_url="http://localhost:8000", timeout=120):
        self.api_key = api_key
        self.base_url = base_url
        self.timeout = timeout

        self.parse_media_endpoint = "/parse_media"
        self.parse_website_endpoint = "/parse_website"
        self.parse_document_endpoint = "/parse_document"

        self.image_process_tasks = {
            "OCR",
            "OCR with Region",
            "Caption",
            "Detailed Caption",
            "More Detailed Caption",
            "Object Detection",
            "Dense Region Caption",
            "Region Proposal",
        }

        self.allowed_audio_extensions = {".mp3", ".wav", ".aac"}
        self.allowed_video_extensions = {".mp4", ".mkv", ".avi", ".mov"}
        self.allowed_document_extensions = {".pdf", ".ppt", ".pptx", ".doc", ".docx"}
        self.allowed_image_extensions = {
            ".png",
            ".jpg",
            ".jpeg",
            ".tiff",
            ".bmp",
            ".heic",
        }

    async def __request__(
        self, endpoint: str, files: dict = None, data: dict = None, json: dict = None
    ) -> dict:
        """
        Internal method to make API requests.

        Args:
            endpoint (str): API endpoint.
            files (dict, optional): Files to be sent with the request.
            data (dict, optional): Form fields to be sent alongside files.
            json (dict, optional): JSON data to be sent with the request.

        Returns:
            dict: JSON response from the API.
        """
        url = f"{self.base_url}{endpoint}"
        headers = {"Authorization": f"Bearer {self.api_key}"} if self.api_key else {}
        async with httpx.AsyncClient() as client:
            response = await client.post(
                url,
                files=files,
                data=data,
                json=json,
                headers=headers,
                timeout=self.timeout,
            )
            response.raise_for_status()
            return response.json()

    async def parse_document(
        self, file_path: str, output_folder: Optional[str] = None
    ) -> ParsedDocument:
        """
        Parse a document file (PDF, PPT, or DOCX) and convert it to structured markdown.

        This method extracts text, tables, and images from the document, providing a
        structured output optimized for LLM applications.

        Args:
            file_path (str): Path to the document file.
            output_folder (Optional[str]): If provided, the parsed data will be saved in this folder.
                A new subfolder will be created with the name of the input file, and the parsed
                content will be saved within this subfolder.

        Returns:
            ParsedDocument: Parsed document data including extracted text, tables, and images.

        Raises:
            ValueError: If the file type is not supported.

        Note:
            If output_folder is provided, the method will save the parsed data and print a
            confirmation message.
        """
        file_ext = os.path.splitext(file_path)[1].lower()

        if file_ext not in self.allowed_document_extensions:
            raise ValueError(
                f"Unsupported file type. Only files of format {', '.join(self.allowed_document_extensions)} are allowed."
            )

        async with aiofiles.open(file_path, "rb") as file:
            file_data = await file.read()
            response = await self.__request__(
                self.parse_document_endpoint, files={"file": file_data}
            )
            data = ParsedDocument(
                **response, source_path=file_path, output_folder=output_folder
            )
            if output_folder:
                data.save_data(echo=True)
            return data
    async def parse_pdf(
        self, file_path: str, output_folder: Optional[str] = None
    ) -> ParsedDocument:
        """
        Parse a PDF file and convert it to structured markdown.

        Args:
            file_path (str): Path to the PDF file.
            output_folder (Optional[str]): If provided, the parsed data will be saved in this folder.
                A new subfolder will be created with the name of the PDF file, and the parsed
                content will be saved within this subfolder.

        Returns:
            ParsedDocument: Parsed PDF data including extracted text, tables, and images.

        Raises:
            ValueError: If the file is not a PDF.

        Note:
            If output_folder is provided, the method will save the parsed data and print a
            confirmation message.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext != ".pdf":
            raise ValueError(
                f"The file must be a PDF (.pdf), but received a file of type {file_ext}"
            )

        async with aiofiles.open(file_path, "rb") as file:
            file_data = await file.read()
            response = await self.__request__(
                f"{self.parse_document_endpoint}/pdf", files={"file": file_data}
            )
            data = ParsedDocument(
                **response, source_path=file_path, output_folder=output_folder
            )
            if output_folder:
                data.save_data(echo=True)
            return data

    async def parse_ppt(
        self, file_path: str, output_folder: Optional[str] = None
    ) -> ParsedDocument:
        """
        Parse a PowerPoint file and convert it to structured markdown.

        Args:
            file_path (str): Path to the PPT or PPTX file.
            output_folder (Optional[str]): If provided, the parsed data will be saved in this folder.
                A new subfolder will be created with the name of the PowerPoint file, and the parsed
                content will be saved within this subfolder.

        Returns:
            ParsedDocument: Parsed PowerPoint data including extracted text, tables, and images.

        Raises:
            ValueError: If the file is not a PPT or PPTX.

        Note:
            If output_folder is provided, the method will save the parsed data and print a
            confirmation message.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext not in [".ppt", ".pptx"]:
            raise ValueError(
                f"The file must be a PPT file (.ppt or .pptx), but received a file of type {file_ext}"
            )

        async with aiofiles.open(file_path, "rb") as file:
            file_data = await file.read()
            response = await self.__request__(
                f"{self.parse_document_endpoint}/ppt", files={"file": file_data}
            )
            data = ParsedDocument(
                **response, source_path=file_path, output_folder=output_folder
            )
            if output_folder:
                data.save_data(echo=True)
            return data

    async def parse_docs(
        self, file_path: str, output_folder: Optional[str] = None
    ) -> ParsedDocument:
        """
        Parse a Word document file and convert it to structured markdown.

        Args:
            file_path (str): Path to the DOC or DOCX file.
            output_folder (Optional[str]): If provided, the parsed data will be saved in this folder.
                A new subfolder will be created with the name of the Word document file, and the parsed
                content will be saved within this subfolder.

        Returns:
            ParsedDocument: Parsed Word document data including extracted text, tables, and images.

        Raises:
            ValueError: If the file is not a DOC or DOCX.

        Note:
            If output_folder is provided, the method will save the parsed data and print a
            confirmation message.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext not in [".doc", ".docx"]:
            raise ValueError(
                f"The file must be a Word document (.doc or .docx), but received a file of type {file_ext}"
            )

        async with aiofiles.open(file_path, "rb") as file:
            file_data = await file.read()
            response = await self.__request__(
                f"{self.parse_document_endpoint}/docs", files={"file": file_data}
            )
            data = ParsedDocument(
                **response, source_path=file_path, output_folder=output_folder
            )
            if output_folder:
                data.save_data(echo=True)
            return data

    async def parse_image(self, file_path: str) -> dict:
        """
        Parse an image file, extracting visual information and generating captions.

        This method can be used for tasks such as object detection, image captioning,
        and text extraction (OCR) from images.

        Args:
            file_path (str): Path to the image file.

        Returns:
            dict: Parsed image data including captions, detected objects, and extracted text.

        Raises:
            ValueError: If the file type is not supported.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext not in self.allowed_image_extensions:
            raise ValueError(
                f"Unsupported file type. Only files of format {', '.join(self.allowed_image_extensions)} are allowed."
            )

        async with aiofiles.open(file_path, "rb") as file:
            file_data = await file.read()
            return await self.__request__(
                f"{self.parse_media_endpoint}/image", files={"file": file_data}
            )

    async def parse_video(self, file_path: str) -> dict:
        """
        Parse a video file, extracting key frames, generating captions, and transcribing audio.

        This method provides a structured representation of the video content, including
        visual and audio information.

        Args:
            file_path (str): Path to the video file.

        Returns:
            dict: Parsed video data including transcriptions, captions, and key frame information.

        Raises:
            ValueError: If the file type is not supported.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext not in self.allowed_video_extensions:
            raise ValueError(
                f"Unsupported file type. Only files of format {', '.join(self.allowed_video_extensions)} are allowed."
            )

        async with aiofiles.open(file_path, "rb") as file:
            file_data = await file.read()
            return await self.__request__(
                f"{self.parse_media_endpoint}/video", files={"file": file_data}
            )
    async def parse_audio(self, file_path: str) -> dict:
        """
        Parse an audio file, transcribing speech to text.

        This method converts spoken words in the audio file to text, providing a textual
        representation of the audio content.

        Args:
            file_path (str): Path to the audio file.

        Returns:
            dict: Parsed audio data including the transcription.

        Raises:
            ValueError: If the file type is not supported.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext not in self.allowed_audio_extensions:
            raise ValueError(
                f"Unsupported file type. Only files of format {', '.join(self.allowed_audio_extensions)} are allowed."
            )

        async with aiofiles.open(file_path, "rb") as file:
            file_data = await file.read()
            return await self.__request__(
                f"{self.parse_media_endpoint}/audio", files={"file": file_data}
            )

    async def process_image(
        self, file_path: str, task: str, prompt: Optional[str] = None
    ) -> dict:
        """
        Process an image with a specific task such as OCR, captioning, or object detection.

        This method allows for more specific image processing tasks beyond basic parsing.

        Args:
            file_path (str): Path to the image file.
            task (str): Image processing task to perform (e.g., "OCR", "Caption", "Object Detection").
            prompt (Optional[str]): Optional prompt for certain tasks, useful for guided processing.

        Returns:
            dict: Processed image data specific to the requested task.

        Raises:
            ValueError: If the task is invalid or the file type is not supported.
        """
        if task not in self.image_process_tasks:
            raise ValueError(
                f"Invalid task. Choose from: {', '.join(self.image_process_tasks)}"
            )
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext not in self.allowed_image_extensions:
            raise ValueError(
                f"Unsupported file type. Only files of format {', '.join(self.allowed_image_extensions)} are allowed."
            )

        async with aiofiles.open(file_path, "rb") as file:
            file_data = await file.read()
            data = {"task": task}
            if prompt:
                data["prompt"] = prompt
            # Send task/prompt as form fields: httpx ignores a `json=` body
            # when `files=` is present, so the task would never reach the
            # server as a JSON payload.
            return await self.__request__(
                f"{self.parse_media_endpoint}/process_image",
                files={"image": file_data},
                data=data,
            )

    async def parse_website(self, url: str) -> dict:
        """
        Parse a website, extracting structured content from web pages.

        This method crawls the specified URL, extracting text, images, and other relevant
        content in a structured format.

        Args:
            url (str): URL of the website to parse.

        Returns:
            dict: Parsed website data including extracted text, links, and media references.
        """
        return await self.__request__(self.parse_website_endpoint, json={"url": url})
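
Complementing the examples in the class docstring, a short sketch of the audio and task-specific image paths. This assumes an OmniParse server is already running on localhost:8000; the file paths are placeholders:

```python
import asyncio

from omniparse_client.omniparse import AsyncOmniParse


async def main():
    parser = AsyncOmniParse(base_url="http://localhost:8000")

    # Speech-to-text via /parse_media/audio
    transcript = await parser.parse_audio("/path/to/meeting.mp3")
    print(transcript)

    # Guided OCR via /parse_media/process_image; the task must be one of
    # parser.image_process_tasks
    ocr = await parser.process_image("/path/to/scan.png", task="OCR")
    print(ocr)


asyncio.run(main())
```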
--------------------------------------------------------------------------------
/python-sdk/omniparse_client/utils.py:
--------------------------------------------------------------------------------
import os
import re
import base64
import mimetypes
from typing import Any, List, Dict, Optional
from pydantic import BaseModel, model_validator


class ImageObj(BaseModel):
    """
    Represents an image object with name, binary data, and MIME type.

    Attributes:
        name (str): The name of the image file.
        bytes (str): The image data as a base64-encoded string.
        mime_type (str): The MIME type of the image, automatically guessed if not provided.

    Methods:
        set_mime_type: A validator that automatically sets the MIME type based on the file name if not provided.
    """

    name: str
    bytes: str
    mime_type: str = None

    @model_validator(mode="before")
    def set_mime_type(cls, values):
        name = values.get("name")
        mime_type = values.get("mime_type")

        if not mime_type and name:
            mime_type, _ = mimetypes.guess_type(name)
            values["mime_type"] = mime_type
        return values


class TableObj(BaseModel):
    """
    Represents a table extracted from markdown.

    Attributes:
        name (str): The name of the table.
        markdown (str): The original markdown representation of the table.
        titles (List[str]): The column titles of the table.
        data (List[List[str]]): The table data as a list of rows, where each row is a list of cell values.
    """

    name: str
    markdown: str
    titles: List[str] = None
    data: List[List[str]] = None


class MetaData(BaseModel):
    """
    Contains metadata about a parsed document.

    Attributes:
        filetype (str): The type of the file (e.g., 'pdf', 'docx').
        language (List[str]): The detected languages in the document.
        toc (List[Any]): Table of contents, if available.
        pages (int): Number of pages in the document.
        ocr_stats (Dict[str, Any]): Statistics related to OCR processing.
        block_stats (Dict[str, Any]): Statistics about document blocks.
        postprocess_stats (Dict[str, Any]): Statistics about post-processing.
    """

    filetype: str
    language: List[str] = []
    toc: List[Any] = []
    pages: int = 0
    ocr_stats: Dict[str, Any] = {}
    block_stats: Dict[str, Any] = {}
    postprocess_stats: Dict[str, Any] = {}


class ParsedDocument(BaseModel):
    """
    Represents a parsed document with its content and associated data.

    Attributes:
        markdown (str): The document content in markdown format.
        images (Optional[List[ImageObj]|dict]): Images extracted from the document.
        tables (Optional[List[TableObj]]): Tables extracted from the document.
        metadata (Optional[MetaData]): Metadata about the document.
        source_path (Optional[str]): Path to the source document.
        output_folder (Optional[str]): Folder to save parsed data.

    Methods:
        parse_images_and_tables: A validator that normalizes the images and tables fields.
        save_data: Saves the parsed document data to files.
    """
    markdown: str
    images: Optional[List[ImageObj] | dict] = None
    tables: Optional[List[TableObj]] = None
    metadata: Optional[MetaData] = None
    source_path: Optional[str] = None
    output_folder: Optional[str] = None

    @model_validator(mode="before")
    def parse_images_and_tables(cls, values):
        images: dict = values.get("images")
        markdown_text: str = values.get("markdown")
        has_tables = bool((values.get("metadata") or {}).get("block_stats"))

        if has_tables:
            # markdown_to_tables returns None when no tables are found,
            # so guard the comprehension with an empty fallback.
            values["tables"] = [
                table.model_dump()
                for table in markdown_to_tables(markdown_text) or []
            ]
        if isinstance(images, dict):
            values["images"] = []
            for name, data in images.items():
                values["images"].append(ImageObj(name=name, bytes=data).model_dump())

        return values

    def save_data(self, echo: bool = False):
        """
        Saves the parsed document data to files.

        Args:
            echo (bool): If True, prints a message after saving the data.
        """
        if not self.output_folder:
            print("No target path provided for saving the parsed data.")
            return
        base_name = os.path.basename(self.source_path)
        filename = os.path.splitext(base_name)[0]

        output_dir = os.path.join(self.output_folder, filename)
        os.makedirs(output_dir, exist_ok=True)
        markdown_output_path = os.path.join(output_dir, "output.md")

        with open(markdown_output_path, "w", encoding="utf-8") as md_file:
            md_file.write(self.markdown)

        if self.images:
            for image_obj in self.images:
                image_filename = image_obj.name
                # Give the file an extension consistent with its MIME type
                # when the name carries none.
                _, ext = os.path.splitext(image_filename)
                if not ext and image_obj.mime_type:
                    image_filename += "." + image_obj.mime_type.split("/")[1]
                image_path = os.path.join(output_dir, image_filename)

                # Image payloads arrive base64-encoded (see
                # save_images_and_markdown below), so decode before writing.
                with open(image_path, "wb") as img_file:
                    img_file.write(base64.b64decode(image_obj.bytes))
        if echo:
            print(f"Data saved to {markdown_output_path}")


def extract_markdown_tables(markdown_string: str) -> List[str]:
    """
    Extracts all tables from a markdown string.

    Args:
        markdown_string (str): The input markdown string containing tables.

    Returns:
        List[str]: A list of strings, where each string is a complete markdown table.
    """
    table_pattern = r"(\|[^\n]+\|\n)((?:\|:?[-]+:?)+\|)(\n(?:\|[^\n]+\|\n?)+)"
    tables = re.findall(table_pattern, markdown_string, re.MULTILINE)
    return ["".join(table) for table in tables]


def markdown_to_tables(markdown: str) -> List[TableObj] | None:
    """
    Converts markdown tables to a list of TableObj instances.

    Args:
        markdown (str): The input markdown string containing tables.

    Returns:
        List[TableObj]|None: A list of TableObj instances if tables are found, None otherwise.
    """
    markdown_tables = extract_markdown_tables(markdown)
    tables = []
    if markdown_tables:
        for i, table_md in enumerate(markdown_tables):
            rows = table_md.strip().split("\n")
            titles = [cell.strip() for cell in rows[0].split("|") if cell.strip()]
            data_rows = [
                row for row in rows[2:] if not set(row.strip(" |")).issubset(set(":-"))
            ]
            data = [
                [cell.strip() for cell in row.split("|") if cell.strip()]
                for row in data_rows
            ]
            tables.append(
                TableObj(
                    data=data,
                    titles=titles,
                    name=f"table_{i}",
                    markdown=table_md,
                )
            )
    return tables or None


def save_images_and_markdown(response_data, output_folder):
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    for pdf in response_data:
        pdf_filename = pdf["filename"]
        pdf_output_folder = os.path.join(
            output_folder, os.path.splitext(pdf_filename)[0]
        )

        # Create a folder for each PDF
        os.makedirs(pdf_output_folder, exist_ok=True)

        # Save markdown
        markdown_text = pdf["markdown"]
        with open(
            os.path.join(pdf_output_folder, "output.md"), "w", encoding="utf-8"
        ) as f:
            f.write(markdown_text)

        # Save images
        image_data = pdf["images"]
        for image_name, image_base64 in image_data.items():
            # Decode base64 image
            image_bytes = base64.b64decode(image_base64)

            # Save image
            with open(os.path.join(pdf_output_folder, image_name), "wb") as f:
                f.write(image_bytes)
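
The table helpers are pure functions, so they are easy to sanity-check in isolation; for example:

```python
from omniparse_client.utils import markdown_to_tables

md = (
    "| name | qty |\n"
    "|------|-----|\n"
    "| foo  | 1   |\n"
    "| bar  | 2   |\n"
)

tables = markdown_to_tables(md)
print(tables[0].titles)  # ['name', 'qty']
print(tables[0].data)    # [['foo', '1'], ['bar', '2']]
print(tables[0].name)    # 'table_0'
```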
--------------------------------------------------------------------------------
/python-sdk/pyproject.toml:
--------------------------------------------------------------------------------
[tool.poetry]
name = "omniparse_client"
version = "0.0.1"
description = "Python client library for OmniParse - easily parse/ingest documents | media | websites"
authors = ["Adithya S Kolavi"]
license = "Apache-2.0"

[tool.poetry.dependencies]
python = "^3.10"
requests = "^2.32.3"
pillow = "^10.3.0"
httpx = "^0.27.0"
pydantic = "^2.7.4"
aiofiles = "^24.1.0"

[tool.poetry.dev-dependencies]

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
--------------------------------------------------------------------------------
/server.py:
--------------------------------------------------------------------------------
import warnings
import argparse
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from omniparse import load_omnimodel
from omniparse.documents.router import document_router
from omniparse.media.router import media_router
from omniparse.image.router import image_router
from omniparse.web.router import website_router
from omniparse.demo import demo_ui

# logging.basicConfig(level=logging.DEBUG)
import gradio as gr

warnings.filterwarnings(
    "ignore", category=UserWarning
)  # Filter torch pytree user warnings
app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Include routers in the main app. Registration lives at module scope because
# uvicorn re-imports "server:app" in its worker process, so routers added
# inside main() would never reach the served application (and registering in
# both places duplicated every route).
app.include_router(document_router, prefix="/parse_document", tags=["Documents"])
app.include_router(image_router, prefix="/parse_image", tags=["Images"])
app.include_router(media_router, prefix="/parse_media", tags=["Media"])
app.include_router(website_router, prefix="/parse_website", tags=["Website"])
app = gr.mount_gradio_app(app, demo_ui, path="")


def main():
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description="Run the omniparse server.")
    parser.add_argument("--host", default="0.0.0.0", help="Host IP address")
    parser.add_argument("--port", type=int, default=8000, help="Port number")
    parser.add_argument("--documents", action="store_true", help="Load document models")
    parser.add_argument("--media", action="store_true", help="Load media models")
    parser.add_argument("--web", action="store_true", help="Load web models")
    parser.add_argument("--reload", action="store_true", help="Reload Server")
    args = parser.parse_args()

    # Load the requested model groups before serving
    load_omnimodel(args.documents, args.media, args.web)

    # Start the server
    import uvicorn

    uvicorn.run("server:app", host=args.host, port=args.port, reload=args.reload)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
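
With the server above running (python server.py --documents --media --web), the mounted routes can be exercised directly. A sketch using requests — the endpoint paths follow the router prefixes above, the "file"/"url" field names follow the SDK client, and the "markdown" response key is assumed from ParsedDocument's required field:

```python
import requests

BASE = "http://localhost:8000"

# Parse a PDF through the documents router
with open("/path/to/document.pdf", "rb") as f:
    resp = requests.post(f"{BASE}/parse_document/pdf", files={"file": f.read()})
resp.raise_for_status()
print(resp.json()["markdown"][:500])

# Crawl and parse a website through the website router
resp = requests.post(f"{BASE}/parse_website", json={"url": "https://example.com"})
resp.raise_for_status()
print(resp.json())
```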