├── .aws └── task_definition.json ├── .env.example ├── .flake8 ├── .gitattributes ├── .github └── workflows │ ├── CI.yml │ ├── build-and-deploy.yml │ ├── build-gpu.yml │ ├── release-please.yml │ └── test-build-docker.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── .release-please-manifest.json ├── .vscode ├── extensions.json ├── launch.json └── settings.json ├── CHANGELOG.md ├── Dockerfile ├── Dockerfile.gpu ├── LICENSE ├── Makefile ├── Pipfile ├── README.md ├── benchmark ├── process_single_doc.py └── test_quality_sim.py ├── docker-compose.dev.yml ├── docker-compose.yml ├── docs └── archive.txt ├── evaluations └── script.py ├── images └── tables.png ├── libs ├── megaparse │ ├── .python-version │ ├── CHANGELOG.md │ ├── README.md │ ├── bench.md │ ├── examples │ │ ├── parse_file_fast.py │ │ ├── parse_file_mp.py │ │ └── parse_file_unstructured.py │ ├── program.prof │ ├── pyproject.toml │ ├── src │ │ └── megaparse │ │ │ ├── __init__.py │ │ │ ├── api │ │ │ ├── __init__.py │ │ │ ├── app.py │ │ │ ├── exceptions │ │ │ │ ├── __init__.py │ │ │ │ └── megaparse_exceptions.py │ │ │ └── models │ │ │ │ ├── __init__.py │ │ │ │ └── base.py │ │ │ ├── configs │ │ │ └── auto.py │ │ │ ├── examples │ │ │ ├── parse_file.py │ │ │ └── parsing_process.py │ │ │ ├── exceptions │ │ │ └── base.py │ │ │ ├── formatter │ │ │ ├── base.py │ │ │ ├── structured_formatter │ │ │ │ ├── __init__.py │ │ │ │ └── custom_structured_formatter.py │ │ │ └── table_formatter │ │ │ │ ├── __init__.py │ │ │ │ ├── llm_table_formatter.py │ │ │ │ └── vision_table_formatter.py │ │ │ ├── layout_detection │ │ │ ├── layout_detector.py │ │ │ ├── models │ │ │ │ └── yolov10s-doclaynet.onnx │ │ │ └── output.py │ │ │ ├── megaparse.py │ │ │ ├── models │ │ │ └── page.py │ │ │ ├── parser │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── builder.py │ │ │ ├── doctr_parser.py │ │ │ ├── entity.py │ │ │ ├── llama.py │ │ │ ├── megaparse_vision.py │ │ │ └── unstructured_parser.py │ │ │ ├── predictor │ │ │ └── 
layout_predictor.py │ │ │ └── utils │ │ │ ├── extract_metadata.py │ │ │ ├── onnx.py │ │ │ └── strategy.py │ └── tests │ │ ├── __init__.py │ │ ├── certs │ │ ├── client-cert.pem │ │ └── client-key.pem │ │ ├── conftest.py │ │ ├── data │ │ ├── MegaFake_report.pdf │ │ ├── dummy.pdf │ │ └── grt_example │ │ │ └── MegaFake_report.md │ │ ├── pdf │ │ ├── mlbook.pdf │ │ ├── native │ │ │ ├── 0168011.pdf │ │ │ ├── 0168014.pdf │ │ │ └── 0168029.pdf │ │ ├── ocr │ │ │ ├── 0168003.pdf │ │ │ ├── 0168004.pdf │ │ │ ├── 0168119.pdf │ │ │ ├── 0168120.pdf │ │ │ ├── 0168123.pdf │ │ │ ├── 0168126.pdf │ │ │ ├── 0168127.pdf │ │ │ └── 0168322.pdf │ │ ├── rust.pdf │ │ ├── sample_native.pdf │ │ ├── sample_pdf.pdf │ │ ├── sample_table.pdf │ │ ├── test_detect_ocr.py │ │ ├── test_pdf_processing.py │ │ └── test_pdfium_parser.py │ │ ├── supported_docs │ │ ├── Sway.epub │ │ ├── file-sample_500kB.odt │ │ ├── file_example_XLSX_50.xlsx │ │ ├── file_example_XLS_50.xls │ │ ├── sample.csv │ │ ├── sample.docx │ │ ├── sample.markdown │ │ ├── sample.md │ │ ├── sample.otf │ │ ├── sample.pptx │ │ ├── sample.txt │ │ ├── sample.xml │ │ ├── sample_complexe.html │ │ └── sample_native.pdf │ │ ├── test_endpoints.py │ │ ├── test_import.py │ │ └── test_parsers.py └── megaparse_sdk │ ├── CHANGELOG.md │ ├── README.md │ ├── __init__.py │ ├── examples │ └── usage_example.py │ ├── megaparse_sdk │ ├── __init__.py │ ├── client.py │ ├── config.py │ ├── endpoints │ │ ├── __init__.py │ │ ├── file_upload.py │ │ └── url_upload.py │ ├── schema │ │ ├── __init__.py │ │ ├── document.py │ │ ├── extensions.py │ │ ├── languages.py │ │ ├── mp_exceptions.py │ │ ├── mp_inputs.py │ │ ├── mp_outputs.py │ │ ├── parser_config.py │ │ └── supported_models.py │ └── utils │ │ └── load_ssl.py │ ├── pyproject.toml │ └── tests │ ├── README.md │ ├── certs │ ├── client-cert.pem │ ├── client-key.pem │ └── rootCA.pem │ ├── pdf │ ├── MegaFake_report.pdf │ └── sample_table.pdf │ └── test_nats_client.py ├── logo.png ├── pyproject.toml ├── 
release-please-config.json ├── requirements-dev.lock └── requirements.lock /.aws/task_definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "taskDefinitionArn": "arn:aws:ecs:eu-west-1:253053805092:task-definition/megaparse-task:2", 3 | "containerDefinitions": [ 4 | { 5 | "name": "megaparse", 6 | "image": "quay.io/unstructured-io/unstructured-api:latest", 7 | "cpu": 0, 8 | "portMappings": [ 9 | { 10 | "containerPort": 8000, 11 | "hostPort": 8000, 12 | "protocol": "tcp" 13 | } 14 | ], 15 | "essential": true, 16 | "environment": [ 17 | { 18 | "name": "UNSTRUCTURED_HI_RES_MODEL_NAME", 19 | "value": "detectron2_onnx" 20 | }, 21 | { 22 | "name": "UNSTRUCTURED_PARALLEL_MODE_ENABLED", 23 | "value": "false" 24 | } 25 | ], 26 | "mountPoints": [], 27 | "volumesFrom": [], 28 | "logConfiguration": { 29 | "logDriver": "awslogs", 30 | "options": { 31 | "awslogs-group": "/ecs/megaparse", 32 | "awslogs-region": "eu-west-1", 33 | "awslogs-stream-prefix": "ecs" 34 | } 35 | }, 36 | "systemControls": [] 37 | } 38 | ], 39 | "family": "megaparse-task", 40 | "executionRoleArn": "arn:aws:iam::253053805092:role/megaparse-ecsTaskExecutionRole", 41 | "networkMode": "awsvpc", 42 | "revision": 2, 43 | "volumes": [], 44 | "status": "ACTIVE", 45 | "requiresAttributes": [ 46 | { 47 | "name": "com.amazonaws.ecs.capability.logging-driver.awslogs" 48 | }, 49 | { 50 | "name": "ecs.capability.execution-role-awslogs" 51 | }, 52 | { 53 | "name": "com.amazonaws.ecs.capability.docker-remote-api.1.19" 54 | }, 55 | { 56 | "name": "com.amazonaws.ecs.capability.docker-remote-api.1.18" 57 | }, 58 | { 59 | "name": "ecs.capability.task-eni" 60 | } 61 | ], 62 | "placementConstraints": [], 63 | "compatibilities": [ 64 | "EC2", 65 | "FARGATE" 66 | ], 67 | "requiresCompatibilities": [ 68 | "FARGATE" 69 | ], 70 | "cpu": "2048", 71 | "memory": "8192", 72 | "tags": [] 73 | } -------------------------------------------------------------------------------- 
/.env.example: -------------------------------------------------------------------------------- 1 | LLAMA_CLOUD_API_KEY=llx-1234567890 2 | OPENAI_API_KEY=sk-1234567890 3 | MEGAPARSE_API_KEY=MyMegaParseKey -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ; Minimal configuration for Flake8 to work with Black. 3 | max-line-length = 100 4 | ignore = E101,E111,E112,E221,E222,E501,E711,E712,W503,W504,F401,BLK100 5 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-vendored 2 | *.html linguist-vendored -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | name: Run tests 2 | 3 | on: 4 | pull_request: 5 | workflow_dispatch: 6 | 7 | env: 8 | NATS_TOKEN: test 9 | 10 | jobs: 11 | test: 12 | name: Run tests on Python ${{ matrix.python-version }} 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: ["3.11", "3.12"] 17 | steps: 18 | - name: 👀 Checkout code 19 | uses: actions/checkout@v2 20 | with: 21 | submodules: true 22 | 23 | - name: Setup apt cache 24 | uses: actions/cache@v2 25 | with: 26 | path: /var/cache/apt/archives 27 | key: ${{ runner.os }}-apt-${{ hashFiles('/etc/apt/sources.list') }} 28 | 29 | - name: 😭 Install system dependencies 30 | run: | 31 | sudo apt-get update && sudo apt-get install -y \ 32 | netcat-traditional \ 33 | unzip \ 34 | libgeos-dev \ 35 | libcurl4-openssl-dev \ 36 | libssl-dev \ 37 | binutils \ 38 | curl \ 39 | git \ 40 | autoconf \ 41 | automake \ 42 | build-essential \ 43 | libtool \ 44 | gcc \ 45 | libmagic-dev \ 46 | poppler-utils \ 47 | tesseract-ocr \ 48 | libreoffice \ 49 | libpq-dev \ 50 | 
pandoc 51 | 52 | - name: 🔽 Install the latest version of rye 53 | uses: eifinger/setup-rye@v4 54 | with: 55 | enable-cache: true 56 | 57 | - name: 📌 Pin Python version 58 | run: rye pin ${{ matrix.python-version }} 59 | 60 | - name: 🔽 Download and Install NATS Server 61 | run: | 62 | curl -L https://github.com/nats-io/nats-server/releases/download/v2.10.22/nats-server-v2.10.22-linux-amd64.zip -o nats-server.zip 63 | unzip nats-server.zip -d nats-server && sudo cp nats-server/nats-server-v2.10.22-linux-amd64/nats-server /usr/bin 64 | 65 | - name: 🛠️ Set up NATS arguments 66 | run: | 67 | nohup nats-server \ 68 | --addr 0.0.0.0 \ 69 | --port 4222 \ 70 | --auth "$NATS_TOKEN" > nats.log 2>&1 & 71 | 72 | - name: 🔍 Verify NATS Server is Running 73 | run: | 74 | sleep 1 # Give the server some time to start 75 | if nc -zv localhost 4222; then 76 | echo "✅ NATS Server is running on port 4222." 77 | else 78 | echo "❌ Failed to start NATS Server." 79 | cat nats.log 80 | exit 1 81 | fi 82 | 83 | - name: 🔨 Sync dependencies 84 | run: | 85 | UV_INDEX_STRATEGY=unsafe-first-match rye sync --no-lock 86 | 87 | - name: 🚀 Run tests 88 | run: | 89 | rye test -p megaparse-sdk 90 | -------------------------------------------------------------------------------- /.github/workflows/build-and-deploy.yml: -------------------------------------------------------------------------------- 1 | name: Build Docker image and push ECR 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*" 7 | branches: [main] 8 | 9 | env: 10 | AWS_REGION: eu-west-1 11 | ECR_REPOSITORY: quivrhq/megaparse 12 | ECS_CLUSTER: megaparse 13 | ECS_TASK_DEFINITION: .aws/task_definition.json 14 | CONTAINER_NAME: megaparse 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | deploy: 21 | name: build docker 22 | runs-on: ubuntu-latest 23 | environment: production 24 | outputs: 25 | imageoutput: ${{ steps.build-image.outputs.imageoutput }} 26 | 27 | steps: 28 | - name: Checkout 29 | uses: actions/checkout@v3 30 | 31 | - name: 
Configure AWS credentials 32 | uses: aws-actions/configure-aws-credentials@v4 33 | with: 34 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 35 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 36 | aws-region: us-east-1 37 | 38 | - name: Login to Amazon ECR 39 | id: login-ecr 40 | uses: aws-actions/amazon-ecr-login@v1 41 | with: 42 | registry-type: public 43 | 44 | - name: Build, tag, and push image to Amazon ECR 45 | id: build-image 46 | env: 47 | ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} 48 | IMAGE_TAG: ${{ github.sha }} 49 | run: | 50 | # Build a docker container and push it to ECR 51 | docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG . 52 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG 53 | 54 | # Tag the image as 'latest' and push 55 | docker tag $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG $ECR_REGISTRY/$ECR_REPOSITORY:latest 56 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:latest 57 | 58 | echo "imageoutput=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> $GITHUB_OUTPUT 59 | -------------------------------------------------------------------------------- /.github/workflows/build-gpu.yml: -------------------------------------------------------------------------------- 1 | name: Build docker GPU and push ECR 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*" 7 | branches: [main] 8 | 9 | env: 10 | AWS_REGION: eu-west-1 11 | ECR_REPOSITORY: quivrhq/megaparse-gpu 12 | ECS_CLUSTER: megaparse 13 | ECS_TASK_DEFINITION: .aws/task_definition.json 14 | CONTAINER_NAME: megaparse 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | deploy: 21 | name: Build docker-gpu 22 | runs-on: 23 | group: big-boy-gpu 24 | environment: production 25 | outputs: 26 | imageoutput: ${{ steps.build-image.outputs.imageoutput }} 27 | 28 | steps: 29 | - name: Checkout 30 | uses: actions/checkout@v3 31 | 32 | - name: Configure AWS credentials 33 | uses: aws-actions/configure-aws-credentials@v4 34 | with: 35 | aws-access-key-id: ${{ 
secrets.AWS_ACCESS_KEY_ID }} 36 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 37 | aws-region: us-east-1 38 | 39 | - name: Login to Amazon ECR 40 | id: login-ecr 41 | uses: aws-actions/amazon-ecr-login@v1 42 | with: 43 | registry-type: public 44 | 45 | - name: Build, tag, and push image to Amazon ECR 46 | id: build-image 47 | env: 48 | ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} 49 | IMAGE_TAG: ${{ github.sha }} 50 | run: | 51 | # Build a docker container and push it to ECR 52 | docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG -f Dockerfile.gpu . 53 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG 54 | 55 | # Tag the image as 'latest' and push 56 | docker tag $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG $ECR_REGISTRY/$ECR_REPOSITORY:latest 57 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:latest 58 | 59 | echo "imageoutput=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> $GITHUB_OUTPUT 60 | -------------------------------------------------------------------------------- /.github/workflows/release-please.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | 6 | permissions: 7 | contents: write 8 | pull-requests: write 9 | 10 | name: release-please 11 | 12 | jobs: 13 | release-please: 14 | runs-on: ubuntu-latest 15 | outputs: 16 | release_created: ${{ steps.release.outputs['libs/megaparse--release_created'] }} 17 | release_created_sdk: ${{ steps.release.outputs['libs/megaparse_sdk--release_created'] }} 18 | steps: 19 | - name: Checkout repository 20 | uses: actions/checkout@v3 21 | with: 22 | fetch-depth: 0 # Fetch all history for tags and releases 23 | 24 | - name: Setup Python 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: "3.11" 28 | 29 | - name: Run release-please 30 | id: release 31 | uses: google-github-actions/release-please-action@v4 32 | with: 33 | token: ${{ secrets.RELEASE_PLEASE_TOKEN }} 34 | 35 | deploy-megaparse: 36 | if: 
needs.release-please.outputs.release_created == 'true' 37 | needs: release-please 38 | runs-on: ubuntu-latest 39 | steps: 40 | - uses: actions/checkout@v4 41 | - name: Install Rye 42 | uses: eifinger/setup-rye@v2 43 | with: 44 | enable-cache: true 45 | - name: Rye Sync 46 | run: rye sync --no-lock 47 | - name: Rye Build 48 | run: cd libs/megaparse && rye build 49 | - name: Rye Publish 50 | run: cd libs/megaparse && rye publish --token ${{ secrets.PYPI_API_TOKEN }} --yes 51 | 52 | deploy-sdk: 53 | if: needs.release-please.outputs.release_created_sdk == 'true' 54 | needs: release-please 55 | runs-on: ubuntu-latest 56 | steps: 57 | - uses: actions/checkout@v4 58 | - name: Install Rye 59 | uses: eifinger/setup-rye@v2 60 | with: 61 | enable-cache: true 62 | - name: Rye Sync 63 | run: cd libs/megaparse_sdk && rye sync --no-lock 64 | - name: Rye Build 65 | run: cd libs/megaparse_sdk && rye build 66 | - name: Rye Publish 67 | run: cd libs/megaparse_sdk && rye publish --token ${{ secrets.PYPI_API_TOKEN }} --yes 68 | -------------------------------------------------------------------------------- /.github/workflows/test-build-docker.yml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: 3 | branches: 4 | - main 5 | 6 | name: Test build docker 7 | jobs: 8 | build-docker: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | matrix: 12 | dockerfile: [Dockerfile, Dockerfile.gpu] 13 | steps: 14 | - name: Checkout repository 15 | uses: actions/checkout@v3 16 | 17 | - name: Set up QEMU 18 | uses: docker/setup-qemu-action@v3 19 | with: 20 | platforms: all 21 | 22 | - name: Set up Docker Buildx 23 | uses: docker/setup-buildx-action@v3 24 | 25 | - name: Build Docker image with caching 26 | uses: docker/build-push-action@v4 27 | with: 28 | context: . 
29 | file: ${{ matrix.dockerfile }} 30 | push: false 31 | tags: quivrhq/megaparse:${{ matrix.dockerfile }} 32 | cache-from: type=gha 33 | cache-to: type=gha,mode=max 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /output 2 | /input 3 | .env 4 | __pycache__/ 5 | dist/** 6 | megaparse.egg-info/ 7 | *.pyc 8 | build/* 9 | ENV 10 | venv 11 | */evaluations/* 12 | */cdp/* 13 | *.pkl 14 | 15 | !megaparse/tests/output_tests/MegaFake_report.md 16 | *.DS_Store 17 | .tool-versions 18 | megaparse/sdk/examples/only_pdfs/* 19 | 20 | **/profile/ 21 | **/prof/ 22 | .ropeproject/ 23 | benchmark/hi_res/* 24 | benchmark/auto/* 25 | 26 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.6.0 4 | hooks: 5 | - id: check-added-large-files 6 | args: ["--maxkb=5000"] 7 | - id: check-toml 8 | - id: check-yaml 9 | - id: end-of-file-fixer 10 | - id: trailing-whitespace 11 | - id: check-merge-conflict 12 | - id: detect-private-key 13 | - id: check-case-conflict 14 | - repo: https://github.com/pre-commit/pre-commit 15 | rev: v3.6.2 16 | hooks: 17 | - id: validate_manifest 18 | - repo: https://github.com/astral-sh/ruff-pre-commit 19 | # Ruff version. 20 | rev: v0.5.1 21 | hooks: 22 | # Run the linter. 23 | - id: ruff 24 | args: [--fix] 25 | additional_dependencies: [] 26 | # Run the formatter. 
27 | - id: ruff-format 28 | additional_dependencies: [] 29 | - repo: https://github.com/pre-commit/mirrors-mypy 30 | rev: v1.10.1 31 | hooks: 32 | - id: mypy 33 | name: mypy 34 | additional_dependencies: ["types-aiofiles"] 35 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11.9 2 | -------------------------------------------------------------------------------- /.release-please-manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "libs/megaparse": "0.0.55", 3 | "libs/megaparse_sdk": "0.1.12" 4 | } 5 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "dbaeumer.vscode-eslint", 4 | "charliermarsh.ruff", 5 | "knisterpeter.vscode-github", 6 | "github.vscode-pull-request-github", 7 | "ms-python.python", 8 | "ms-python.vscode-pylance", 9 | "ms-python.debugpy" 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Python: Remote Attach", 6 | "type": "python", 7 | "request": "attach", 8 | "connect": { 9 | "host": "localhost", 10 | "port": 5678 11 | }, 12 | "pathMappings": [ 13 | { 14 | "localRoot": "${workspaceFolder}/backend", 15 | "remoteRoot": "." 
16 | } 17 | ], 18 | "justMyCode": true 19 | }, 20 | { 21 | "name": "Python: Debug Test Script", 22 | "type": "python", 23 | "request": "launch", 24 | "program": "${workspaceFolder}/backend/test_process_file_and_notify.py", 25 | "console": "integratedTerminal", 26 | "justMyCode": false 27 | }, 28 | { 29 | "name": "Python: Debug", 30 | "type": "debugpy", 31 | "request": "launch", 32 | "program": "${file}", 33 | "console": "integratedTerminal", 34 | "justMyCode": false, 35 | "env": { 36 | "PYTHONPATH": "${workspaceFolder}/backend:${env:PYTHONPATH}" 37 | }, 38 | "envFile": "${workspaceFolder}/.env" 39 | } 40 | ] 41 | } 42 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.formatOnSave": true, 3 | "editor.formatOnSaveMode": "file", 4 | "files.exclude": { 5 | "**/__pycache__": true, 6 | "**/.benchmarks/": true, 7 | "**/.cache/": true, 8 | "**/.pytest_cache/": true, 9 | "**/.next/": true, 10 | "**/build/": true, 11 | "**/.docusaurus/": true, 12 | "**/node_modules/": true 13 | }, 14 | "[python]": { 15 | "editor.defaultFormatter": "charliermarsh.ruff", 16 | "editor.formatOnSave": true, 17 | "editor.codeActionsOnSave": { 18 | "source.organizeImports": "explicit", 19 | "source.fixAll": "explicit" 20 | } 21 | }, 22 | "python.testing.unittestEnabled": false, 23 | "python.testing.pytestEnabled": true, 24 | "python.testing.autoTestDiscoverOnSaveEnabled": true, 25 | "python.analysis.autoImportCompletions": true, 26 | "python.analysis.typeCheckingMode": "basic", 27 | "python.analysis.diagnosticSeverityOverrides": { 28 | "reportMissingImports": "error", 29 | "reportUnusedImport": "warning", 30 | "reportGeneralTypeIssues": "warning" 31 | }, 32 | "makefile.configureOnOpen": false 33 | } 34 | -------------------------------------------------------------------------------- /Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM python:3.11.10-slim-bullseye 2 | 3 | WORKDIR /app 4 | 5 | # Install runtime dependencies 6 | RUN apt-get update && apt-get upgrade -y && apt-get install -y \ 7 | libgeos-dev \ 8 | libcurl4-openssl-dev \ 9 | libssl-dev \ 10 | binutils \ 11 | curl \ 12 | git \ 13 | autoconf \ 14 | automake \ 15 | build-essential \ 16 | libtool \ 17 | python3-dev \ 18 | build-essential \ 19 | wget \ 20 | gcc \ 21 | # Additional dependencies for document handling 22 | libmagic-dev \ 23 | poppler-utils \ 24 | tesseract-ocr \ 25 | libreoffice \ 26 | libpq-dev \ 27 | pandoc && \ 28 | rm -rf /var/lib/apt/lists/* && apt-get clean 29 | 30 | COPY requirements.lock pyproject.toml README.md ./ 31 | COPY libs/megaparse/pyproject.toml libs/megaparse/README.md libs/megaparse/ 32 | COPY libs/megaparse_sdk/pyproject.toml libs/megaparse_sdk/README.md libs/megaparse_sdk/ 33 | 34 | RUN pip install uv 35 | RUN uv pip install --no-cache --system -r requirements.lock 36 | 37 | RUN playwright install --with-deps 38 | RUN python3 -m nltk.downloader all 39 | 40 | COPY . . 
41 | 42 | RUN uv pip install --no-cache --system /app/libs/megaparse /app/libs/megaparse_sdk 43 | 44 | EXPOSE 8000 45 | CMD ["uvicorn", "megaparse.api.app:app", "--host", "0.0.0.0", "--port", "8000"] 46 | -------------------------------------------------------------------------------- /Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu20.04 2 | 3 | WORKDIR /app 4 | 5 | ENV UV_COMPILE_BYTECODE=1 6 | ENV UV_NO_CACHE=1 7 | ENV DEBIAN_FRONTEND=noninteractive 8 | 9 | # Install runtime dependencies 10 | RUN apt-get update && apt-get install -y software-properties-common && \ 11 | add-apt-repository ppa:deadsnakes/ppa && \ 12 | apt-get update && apt-get install -y \ 13 | python3.11 \ 14 | python3.11-dev \ 15 | libgeos-dev \ 16 | libcurl4-openssl-dev \ 17 | libssl-dev \ 18 | binutils \ 19 | curl \ 20 | git \ 21 | autoconf \ 22 | automake \ 23 | libtool \ 24 | python3-pip \ 25 | build-essential \ 26 | wget \ 27 | gcc \ 28 | # Additional dependencies for document handling 29 | libmagic-dev \ 30 | poppler-utils \ 31 | tesseract-ocr \ 32 | libreoffice \ 33 | libpq-dev \ 34 | pandoc && \ 35 | rm -rf /var/lib/apt/lists/* && apt-get clean 36 | 37 | RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \ 38 | update-alternatives --set python3 /usr/bin/python3.11 39 | 40 | COPY requirements.lock pyproject.toml README.md ./ 41 | COPY libs/megaparse/pyproject.toml libs/megaparse/README.md libs/megaparse/ 42 | COPY libs/megaparse_sdk/pyproject.toml libs/megaparse_sdk/README.md libs/megaparse_sdk/ 43 | 44 | RUN curl -LsSf https://astral.sh/uv/install.sh | sh 45 | ENV PATH="/root/.local/bin:$PATH" 46 | RUN uv pip install --no-cache --system -r requirements.lock 47 | 48 | RUN playwright install --with-deps 49 | RUN python3 -m nltk.downloader all 50 | 51 | # FIXME: causes runtime link issues with onnxruntime_pybind_state.cc:507 unstructured 52 | # RUN python3 -c 
"from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \ 53 | # RUN python3 -c "import nltk; nltk.download('punkt_tab'); nltk.download('averaged_perceptron_tagger_eng')" 54 | 55 | COPY . . 56 | 57 | RUN uv pip install --no-cache --system /app/libs/megaparse /app/libs/megaparse_sdk 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_TARGET=help 2 | 3 | ## help: Display list of commands 4 | .PHONY: help 5 | help: 6 | @echo "Available commands:" 7 | @sed -n 's|^##||p' $(MAKEFILE_LIST) | column -t ':' | sed -e 's|^| |' 8 | 9 | ## dev: Start development environment 10 | .PHONY: dev 11 | dev: 12 | DOCKER_BUILDKIT=1 docker compose -f docker-compose.dev.yml up --build 13 | 14 | ## dev-build: Build development environment without cache 15 | .PHONY: dev-build 16 | dev-build: 17 | DOCKER_BUILDKIT=1 docker compose -f docker-compose.dev.yml build --no-cache 18 | DOCKER_BUILDKIT=1 docker compose -f docker-compose.dev.yml up 19 | 20 | ## prod: Build and start production environment 21 | .PHONY: prod 22 | prod: 23 | docker compose -f docker-compose.yml up --build 24 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = 
"https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | 8 | [dev-packages] 9 | 10 | [requires] 11 | python_version = "3.11" 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MegaParse - Your Parser for every type of documents 2 | 3 |
4 | Quivr-logo 5 |
6 | 7 | MegaParse is a powerful and versatile parser that can handle various types of documents with ease. Whether you're dealing with text, PDFs, Powerpoint presentations, Word documents MegaParse has got you covered. Focus on having no information loss during parsing. 8 | 9 | ## Key Features 🎯 10 | 11 | - **Versatile Parser**: MegaParse is a powerful and versatile parser that can handle various types of documents with ease. 12 | - **No Information Loss**: Focus on having no information loss during parsing. 13 | - **Fast and Efficient**: Designed with speed and efficiency at its core. 14 | - **Wide File Compatibility**: Supports Text, PDF, Powerpoint presentations, Excel, CSV, Word documents. 15 | - **Open Source**: Freedom is beautiful, and so is MegaParse. Open source and free to use. 16 | 17 | ## Support 18 | 19 | - Files: ✅ PDF ✅ Powerpoint ✅ Word 20 | - Content: ✅ Tables ✅ TOC ✅ Headers ✅ Footers ✅ Images 21 | 22 | ### Example 23 | 24 | https://github.com/QuivrHQ/MegaParse/assets/19614572/1b4cdb73-8dc2-44ef-b8b4-a7509bc8d4f3 25 | 26 | ## Installation 27 | 28 | required python version >= 3.11 29 | 30 | ```bash 31 | pip install megaparse 32 | ``` 33 | 34 | ## Usage 35 | 36 | 1. Add your OpenAI or Anthropic API key to the .env file 37 | 38 | 2. Install poppler on your computer (images and PDFs) 39 | 40 | 3. Install tesseract on your computer (images and PDFs) 41 | 42 | 4. 
If you have a mac, you also need to install libmagic ```brew install libmagic``` 43 | 44 | Use MegaParse as it is : 45 | ```python 46 | from megaparse import MegaParse 47 | from langchain_openai import ChatOpenAI 48 | 49 | megaparse = MegaParse() 50 | response = megaparse.load("./test.pdf") 51 | print(response) 52 | ``` 53 | 54 | ### Use MegaParse Vision 55 | 56 | ```python 57 | from megaparse.parser.megaparse_vision import MegaParseVision 58 | 59 | model = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY")) # type: ignore 60 | parser = MegaParseVision(model=model) 61 | response = parser.convert("./test.pdf") 62 | print(response) 63 | 64 | ``` 65 | **Note**: The model supported by MegaParse Vision are the multimodal ones such as claude 3.5, claude 4, gpt-4o and gpt-4. 66 | 67 | ## Use as an API 68 | There is a MakeFile for you, simply use : 69 | ```make dev``` 70 | at the root of the project and you are good to go. 71 | 72 | See localhost:8000/docs for more info on the different endpoints ! 73 | 74 | ## BenchMark 75 | 76 | 77 | | Parser | similarity_ratio | 78 | | ----------------------------- | ---------------- | 79 | | megaparse_vision | 0.87 | 80 | | unstructured_with_check_table | 0.77 | 81 | | unstructured | 0.59 | 82 | | llama_parser | 0.33 | 83 | 84 | 85 | _Higher the better_ 86 | 87 | Note: Want to evaluate and compare your Megaparse module with ours ? Please add your config in ```evaluations/script.py``` and then run ```python evaluations/script.py```. If it is better, do a PR, I mean, let's go higher together . 
88 | 89 | ## In Construction 🚧 90 | - Improve table checker 91 | - Create Checkers to add **modular postprocessing** ⚙️ 92 | - Add Structured output, **let's get computer talking** 🤖 93 | 94 | 95 | 96 | ## Star History 97 | 98 | [![Star History Chart](https://api.star-history.com/svg?repos=QuivrHQ/MegaParse&type=Date)](https://star-history.com/#QuivrHQ/MegaParse&Date) 99 | -------------------------------------------------------------------------------- /benchmark/process_single_doc.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import time 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | from megaparse import MegaParse 7 | 8 | N_TRY = 1 9 | 10 | 11 | async def process_file(megaparse: MegaParse, file_path: str | Path): 12 | try: 13 | t0 = time.perf_counter() 14 | _ = await megaparse.aload( 15 | file_path=file_path, 16 | ) 17 | total = time.perf_counter() - t0 18 | return total 19 | except Exception as e: 20 | print(f"Exception occured: {e}") 21 | return None 22 | 23 | 24 | async def test_process_file(file: str | Path): 25 | # parser = UnstructuredParser(strategy=StrategyEnum.HI_RES) 26 | megaparse = MegaParse() 27 | task = [] 28 | for _ in range(N_TRY): 29 | task.append(process_file(megaparse, file)) 30 | list_process_time = await asyncio.gather(*task) 31 | 32 | n_errors = sum([t is None for t in list_process_time]) 33 | list_process_time = [t for t in list_process_time if t is not None] 34 | 35 | np_list_process_time = np.array(list_process_time) 36 | print(f"All errors : {n_errors}") 37 | print(f"Average time taken: {np_list_process_time.mean()}") 38 | print(f"Median time taken: {np.median(list_process_time)}") 39 | print(f"Standard deviation of time taken: {np.std(list_process_time)}") 40 | print(f"Max time taken: {np.max(list_process_time)}") 41 | print(f"Min time taken: {np.min(list_process_time)}") 42 | 43 | 44 | if __name__ == "__main__": 45 | folder_path = 
"/Users/amine/data/quivr/parsing/scanned/machine.pdf" 46 | asyncio.run(test_process_file(folder_path)) 47 | -------------------------------------------------------------------------------- /benchmark/test_quality_sim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import difflib 3 | from pathlib import Path 4 | 5 | auto_dir = Path("benchmark/auto") 6 | hi_res_dir = Path("benchmark/hi_res") 7 | 8 | 9 | def jaccard_similarity(str1, str2): 10 | if len(str1) == 0 and len(str2) == 0: 11 | return 1 12 | # Tokenize the strings into sets of words 13 | words1 = set(str1.split()) 14 | words2 = set(str2.split()) 15 | 16 | # Find intersection and union of the word sets 17 | intersection = words1.intersection(words2) 18 | union = words1.union(words2) 19 | 20 | # Compute Jaccard similarity 21 | return len(intersection) / len(union) if len(union) != 0 else 0 22 | 23 | 24 | def compare_files(file_name): 25 | file_path_auto = auto_dir / f"{file_name}.md" 26 | file_path_hi_res = hi_res_dir / f"{file_name}.md" 27 | 28 | with open(file_path_auto, "r") as f: 29 | auto_content = f.read() 30 | 31 | with open(file_path_hi_res, "r") as f: 32 | hi_res_content = f.read() 33 | 34 | if len(auto_content) == 0 and len(hi_res_content) == 0: 35 | return 1 36 | 37 | similarity = difflib.SequenceMatcher(None, auto_content, hi_res_content).ratio() 38 | # similarity = jaccard_similarity(auto_content, hi_res_content) 39 | 40 | return similarity 41 | 42 | 43 | def main(): 44 | files = os.listdir(hi_res_dir) 45 | print(f"Comparing {len(files)} files...") 46 | similarity_dict = {} 47 | for file in files: 48 | file_name = Path(file).stem 49 | similarity = compare_files(file_name) 50 | similarity_dict[file_name] = similarity 51 | 52 | avg_similarity = sum(similarity_dict.values()) / len(similarity_dict) 53 | print(f"\nAverage similarity: {avg_similarity}\n") 54 | 55 | pass_rate = sum( 56 | [similarity > 0.9 for similarity in similarity_dict.values()] 57 
| ) / len(similarity_dict) 58 | 59 | print(f"Pass rate: {pass_rate}\n") 60 | 61 | print("Under 0.9 similarity documents:") 62 | print("-------------------------------") 63 | for file_name, similarity in similarity_dict.items(): 64 | if similarity < 0.9: 65 | print(f"{file_name}: {similarity}") 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /docker-compose.dev.yml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | services: 4 | megaparse: 5 | build: 6 | context: . 7 | dockerfile: Dockerfile 8 | cache_from: 9 | - megaparse:latest 10 | args: 11 | - DEV_MODE=true 12 | image: megaparse:latest 13 | extra_hosts: 14 | - "host.docker.internal:host-gateway" 15 | container_name: megaparse 16 | volumes: 17 | - ./:/app/ 18 | command: > 19 | /bin/bash -c "python -m uvicorn megaparse.api.app:app --host 0.0.0.0 --log-level info --reload --port 8000" 20 | restart: always 21 | ports: 22 | - 8000:8000 23 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | services: 4 | megaparse: 5 | image: megaparse:latest 6 | pull_policy: if_not_present 7 | container_name: megaparse 8 | extra_hosts: 9 | - "host.docker.internal:host-gateway" 10 | healthcheck: 11 | test: [ "CMD", "curl", "http://localhost:5050/healthz" ] 12 | command: > 13 | /bin/bash -c "python -m uvicorn megaparse.api.app:app --host 0.0.0.0 --log-level info --reload --port 8000 --loop uvloop" 14 | restart: always 15 | ports: 16 | - 8000:8000 17 | -------------------------------------------------------------------------------- /docs/archive.txt: -------------------------------------------------------------------------------- 1 | ### (Optional) Use LlamaParse for Improved Results 2 | 3 | 1. 
Create an account on [Llama Cloud](https://cloud.llamaindex.ai/) and get your API key. 4 | 5 | 2. Change the parser to LlamaParser 6 | 7 | ```python 8 | from megaparse import MegaParse 9 | from langchain_openai import ChatOpenAI 10 | from megaparse.parser.llama_parser import LlamaParser 11 | 12 | parser = LlamaParser(api_key = os.getenv("LLAMA_CLOUD_API_KEY")) 13 | megaparse = MegaParse(parser) 14 | response = megaparse.load("./test.pdf") 15 | print(response) 16 | megaparse.save("./test.md") #saves the last processed doc in md format 17 | ``` -------------------------------------------------------------------------------- /evaluations/script.py: -------------------------------------------------------------------------------- 1 | import difflib 2 | import os 3 | 4 | from langchain_openai import ChatOpenAI 5 | from megaparse.megaparse import MegaParse 6 | from megaparse.parser.llama import LlamaParser 7 | from megaparse.parser.megaparse_vision import MegaParseVision 8 | from megaparse.parser.unstructured_parser import UnstructuredParser 9 | from megaparse_sdk.schema.parser_config import StrategyEnum 10 | 11 | if __name__ == "__main__": 12 | print("---Launching evaluations script---") 13 | model = ChatOpenAI(model="gpt-4o", api_key=str(os.getenv("OPENAI_API_KEY"))) # type: ignore 14 | parser_dict = { 15 | "unstructured": UnstructuredParser(strategy=StrategyEnum.AUTO, model=None), 16 | "unstructured_with_check_table": UnstructuredParser( 17 | strategy=StrategyEnum.AUTO, 18 | model=model, 19 | ), 20 | "llama_parser": LlamaParser(api_key=str(os.getenv("LLAMA_CLOUD_API_KEY"))), 21 | "megaparse_vision": MegaParseVision(model=model), 22 | } 23 | 24 | base_pdf_path = "tests/data/MegaFake_report.pdf" 25 | base_md_path = "tests/data/grt_example/MegaFake_report.md" 26 | with open(base_md_path, "r", encoding="utf-8") as f: 27 | base_md = f.read() 28 | 29 | score_dict = {} 30 | 31 | for method, parser in parser_dict.items(): 32 | print(f"Method: {method}") 33 | megaparse = 
MegaParse() 34 | result = megaparse.load(file_path=base_pdf_path) 35 | score_dict[method] = difflib.SequenceMatcher(None, base_md, result).ratio() 36 | print(f"Score for method {method}: {score_dict[method]}") 37 | 38 | # Sort the results 39 | sorted_score = sorted(score_dict.items(), key=lambda x: x[1], reverse=True) 40 | 41 | # Generate a table with the results 42 | benchmark_results = "| Parser | similarity_ratio |\n|---|---|\n" 43 | for parser, score in sorted_score: 44 | benchmark_results += f"| {parser} | {score:.2f} |\n" 45 | 46 | print(benchmark_results) 47 | 48 | # Update README.md file 49 | with open("README.md", "r") as readme_file: 50 | readme_content = readme_file.read() 51 | 52 | start_marker = "" 53 | end_marker = "" 54 | start_index = readme_content.find(start_marker) + len(start_marker) 55 | end_index = readme_content.find(end_marker) 56 | 57 | updated_readme_content = ( 58 | readme_content[:start_index] 59 | + "\n" 60 | + benchmark_results 61 | + readme_content[end_index:] 62 | ) 63 | 64 | with open("README.md", "w") as readme_file: 65 | readme_file.write(updated_readme_content) 66 | -------------------------------------------------------------------------------- /images/tables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/images/tables.png -------------------------------------------------------------------------------- /libs/megaparse/.python-version: -------------------------------------------------------------------------------- 1 | 3.11.9 -------------------------------------------------------------------------------- /libs/megaparse/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [0.0.55](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.54...megaparse-v0.0.55) (2025-02-14) 4 | 5 | 6 | ### Features 7 | 8 | * remove tensorrt 
([#230](https://github.com/QuivrHQ/MegaParse/issues/230)) ([8b8abbc](https://github.com/QuivrHQ/MegaParse/commit/8b8abbc6a2a1b33d4e921d55d2519b773ec062c8)) 9 | 10 | ## [0.0.54](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.53...megaparse-v0.0.54) (2025-02-11) 11 | 12 | 13 | ### Features 14 | 15 | * add_layout_detection ([#220](https://github.com/QuivrHQ/MegaParse/issues/220)) ([2d2d0b4](https://github.com/QuivrHQ/MegaParse/commit/2d2d0b42bba4c883db423568e932eda42edd60d7)) 16 | 17 | ## [0.0.53](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.52...megaparse-v0.0.53) (2025-01-16) 18 | 19 | 20 | ### Features 21 | 22 | * modular parser and formatter v0 ([#175](https://github.com/QuivrHQ/MegaParse/issues/175)) ([1f4dcf8](https://github.com/QuivrHQ/MegaParse/commit/1f4dcf88a5901c5a2682cb79284a0dbb08034cb2)) 23 | * Text detection in auto strategy ([#209](https://github.com/QuivrHQ/MegaParse/issues/209)) ([03c7ada](https://github.com/QuivrHQ/MegaParse/commit/03c7ada1dc245e13ef41ffd6fa3a8ed869269d37)) 24 | * type strategy output ([#216](https://github.com/QuivrHQ/MegaParse/issues/216)) ([deb8765](https://github.com/QuivrHQ/MegaParse/commit/deb8765a4df8917a4857f51a02025243192d5cf8)) 25 | 26 | 27 | ### Bug Fixes 28 | 29 | * Add EngineConfig & StrategyHandler ([#211](https://github.com/QuivrHQ/MegaParse/issues/211)) ([2e1c6dd](https://github.com/QuivrHQ/MegaParse/commit/2e1c6ddd676227d1cbc4cff9771b20595259ba38)) 30 | * add parse tests for every supported extensions ([#198](https://github.com/QuivrHQ/MegaParse/issues/198)) ([9dff0de](https://github.com/QuivrHQ/MegaParse/commit/9dff0de0c1de848151fe9a6519b658f0924c1228)) 31 | * logging error ([#218](https://github.com/QuivrHQ/MegaParse/issues/218)) ([a2170d7](https://github.com/QuivrHQ/MegaParse/commit/a2170d7c711a5d7a0531f03aa9576937ddd6576e)) 32 | * megaparse.load & add tests ([#202](https://github.com/QuivrHQ/MegaParse/issues/202)) 
([13c2677](https://github.com/QuivrHQ/MegaParse/commit/13c2677bdadb4ba985a1abf9bafeb70548ab59f9)) 33 | * Strategy heuristic test & fix ([#203](https://github.com/QuivrHQ/MegaParse/issues/203)) ([7b7fb40](https://github.com/QuivrHQ/MegaParse/commit/7b7fb40cae4ed380a5f0ca0035a7bd2bcc9147c3)) 34 | * sync convert to parsers ([#186](https://github.com/QuivrHQ/MegaParse/issues/186)) ([fbb7d36](https://github.com/QuivrHQ/MegaParse/commit/fbb7d365fbaf710a687fdc6becacd6d301c09707)) 35 | 36 | ## [0.0.52](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.51...megaparse-v0.0.52) (2024-12-16) 37 | 38 | 39 | ### Bug Fixes 40 | 41 | * hatchling version ([#193](https://github.com/QuivrHQ/MegaParse/issues/193)) ([f6070a5](https://github.com/QuivrHQ/MegaParse/commit/f6070a5483a20eeb83751a2dcfc01b7f0fb14473)) 42 | 43 | ## [0.0.51](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.50...megaparse-v0.0.51) (2024-12-16) 44 | 45 | 46 | ### Features 47 | 48 | * updating langchain version ([#187](https://github.com/QuivrHQ/MegaParse/issues/187)) ([0f1f597](https://github.com/QuivrHQ/MegaParse/commit/0f1f5977df147e6b8c65d55445ccd86ef6f1a862)) 49 | 50 | ## [0.0.50](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.49...megaparse-v0.0.50) (2024-12-13) 51 | 52 | 53 | ### Features 54 | 55 | * small fixes ([#181](https://github.com/QuivrHQ/MegaParse/issues/181)) ([004afe2](https://github.com/QuivrHQ/MegaParse/commit/004afe2f170570075bbebcd32dec5d15ddba4609)) 56 | 57 | ## [0.0.49](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.48...megaparse-v0.0.49) (2024-12-12) 58 | 59 | 60 | ### Features 61 | 62 | * custom auto ([#131](https://github.com/QuivrHQ/MegaParse/issues/131)) ([3cb5be4](https://github.com/QuivrHQ/MegaParse/commit/3cb5be4a8c8eeb6dd6e9b87d7bbca24491db4c29)) 63 | * faster ocr ([#180](https://github.com/QuivrHQ/MegaParse/issues/180)) ([5661cb2](https://github.com/QuivrHQ/MegaParse/commit/5661cb2d52d959cbca0f41339791129cd35d4036)) 64 | 65 | ## 
[0.0.48](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.47...megaparse-v0.0.48) (2024-12-03) 66 | 67 | 68 | ### Features 69 | 70 | * Update imports and parsers in README.md ([#156](https://github.com/QuivrHQ/MegaParse/issues/156)) ([33e0303](https://github.com/QuivrHQ/MegaParse/commit/33e0303821691c4b1fc821e6b33b874bd332d430)) 71 | 72 | ## [0.0.47](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.46...megaparse-v0.0.47) (2024-11-21) 73 | 74 | 75 | ### Features 76 | 77 | * refacto megaparse for service ([#132](https://github.com/QuivrHQ/MegaParse/issues/132)) ([ab9ad7f](https://github.com/QuivrHQ/MegaParse/commit/ab9ad7fb7db580a04a998d144dd2ba3407068334)) 78 | * release plz ([#134](https://github.com/QuivrHQ/MegaParse/issues/134)) ([d8a221e](https://github.com/QuivrHQ/MegaParse/commit/d8a221e23f6e15e969c1328f183da3582d0d7925)) 79 | 80 | ## [0.0.22](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.21...megaparse-v0.0.22) (2024-07-24) 81 | 82 | 83 | ### Features 84 | 85 | * Add instructions for installing poppler and tesseract ([#10](https://github.com/QuivrHQ/MegaParse/issues/10)) ([3399552](https://github.com/QuivrHQ/MegaParse/commit/3399552bc8be705f6d34306743388a96d099eebc)) 86 | * Add MegaParse class to __init__.py ([84c0d64](https://github.com/QuivrHQ/MegaParse/commit/84c0d648ef1ddf048ec911210d89be155443dc72)) 87 | * Add support for Unstructured Parser, improve Table and Image Parsing, and add TOC and Hyperlinks for Docx ([#9](https://github.com/QuivrHQ/MegaParse/issues/9)) ([4934776](https://github.com/QuivrHQ/MegaParse/commit/493477672cef9fe22b0ab56ced1d5572104e1914)) 88 | * base loader ([#65](https://github.com/QuivrHQ/MegaParse/issues/65)) ([eb8149f](https://github.com/QuivrHQ/MegaParse/commit/eb8149f05ec2793f59fd87109a1aba8095f6f1d0)) 89 | * base loader class ([#64](https://github.com/QuivrHQ/MegaParse/issues/64)) ([801a026](https://github.com/QuivrHQ/MegaParse/commit/801a026e4b3411f8ac85171a6928e3d17c027648)) 90 | * Update 
benchmark results in README.md ([#15](https://github.com/QuivrHQ/MegaParse/issues/15)) ([1dfcb4c](https://github.com/QuivrHQ/MegaParse/commit/1dfcb4ce19467f7fb8137e10e5f5fbf35e563df0)) 91 | 92 | 93 | ### Bug Fixes 94 | 95 | * add __init__.py ([a5b8de9](https://github.com/QuivrHQ/MegaParse/commit/a5b8de9e1e01ef681ac2ef59a6e111ae7bd6cf70)) 96 | * change name ([6b36437](https://github.com/QuivrHQ/MegaParse/commit/6b36437787f048d36d69c3b06c2d59f7dc7a741f)) 97 | * PR Comments ([a0ab0ba](https://github.com/QuivrHQ/MegaParse/commit/a0ab0baa5dd9aae644baef55348f1af28a6776a7)) 98 | * remove nest asycio ([22195a2](https://github.com/QuivrHQ/MegaParse/commit/22195a27e9dc3583bf1fbde2a95e9fbecc8d96a4)) 99 | * use aload_data ([e5c73fe](https://github.com/QuivrHQ/MegaParse/commit/e5c73fefcbf09bb12810adc6d4412f7742c42089)) 100 | 101 | ## [0.0.21](https://github.com/QuivrHQ/MegaParse/compare/v0.0.20...v0.0.21) (2024-07-24) 102 | 103 | 104 | ### Features 105 | 106 | * base loader ([#65](https://github.com/QuivrHQ/MegaParse/issues/65)) ([eb8149f](https://github.com/QuivrHQ/MegaParse/commit/eb8149f05ec2793f59fd87109a1aba8095f6f1d0)) 107 | * base loader class ([#64](https://github.com/QuivrHQ/MegaParse/issues/64)) ([801a026](https://github.com/QuivrHQ/MegaParse/commit/801a026e4b3411f8ac85171a6928e3d17c027648)) 108 | -------------------------------------------------------------------------------- /libs/megaparse/README.md: -------------------------------------------------------------------------------- 1 | # MegaParse CORE 2 | 3 | - Core package of megaparse 4 | 5 | > **Note:** The test files in `tests/pdf/ocr` and `tests/pdf/native` come from SAFEDOCS (CC-MAIN-2021-31-PDF-UNTRUNCATED). You can find more information [here](https://digitalcorpora.org/corpora/file-corpora/cc-main-2021-31-pdf-untruncated/). 
-------------------------------------------------------------------------------- /libs/megaparse/bench.md: -------------------------------------------------------------------------------- 1 | ------------ 2 | UNSTRUCTURED(HI-RES): 3 | ------------ 4 | 5 | folder: cdp 6 | cdp_etiquette.pdf parsing took: 2.10s 7 | folder: scanned-tables 8 | POZIBILAN 2022.pdf parsing took: 78.72s 9 | Banco Popilar Number 2.pdf parsing took: 94.44s 10 | folder: native 11 | 00b03d60-fe45-4318-a511-18ee921b7bbb.pdf parsing took: 3.25s 12 | 0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf parsing took: 39.75s 13 | 0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf parsing took: 25.02s 14 | folder: scanned 15 | machine.pdf parsing took: 54.29s 16 | medical.pdf parsing took: 76.11s 17 | les_americains.pdf parsing took: 643.84s 18 | agency.pdf parsing took: 114.19s 19 | clark.pdf parsing took: 27.89s 20 | tables_ocr.pdf parsing took: 81.21s 21 | folder: rich 22 | language_learning.pdf parsing took: 2.60s 23 | dites nous tout....pdf parsing took: 1.62s 24 | 25 | ------------ 26 | UNSTRUCTURED(FAST): 27 | ------------ 28 | folder: cdp 29 | cdp_etiquette.pdf parsing took: 0.05s 30 | folder: scanned-tables 31 | POZIBILAN 2022.pdf: can't parse 32 | Banco Popilar Number 2.pdf: can't parse 33 | folder: native 34 | 00b03d60-fe45-4318-a511-18ee921b7bbb.pdf parsing took: 0.07s 35 | 0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf parsing took: 0.86s 36 | 0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf parsing took: 0.24s 37 | folder: scanned 38 | machine.pdf parsing took: 0.02s 39 | medical.pdf parsing took: 0.04s 40 | les_americains.pdf parsing took: 5.90s 41 | agency.pdf: can't parse 42 | clark.pdf: can't parse 43 | tables_ocr.pdf: can't parse 44 | folder: rich 45 | language_learning.pdf: can't parse 46 | dites nous tout....pdf parsing took: 0.02s 47 | 48 | ------------ 49 | Megaparse ( 50 | strategy = AUTO 51 | Config = { 52 | provider=COREML, 53 | det_arch: str = "fast_base" 54 | det_batch_size: int = 2 55 | 
assume_straight_pages: bool = True 56 | preserve_aspect_ratio: bool = True 57 | symmetric_pad: bool = True 58 | load_in_8_bit: bool = False 59 | reco_arch: str = "crnn_vgg16_bn" 60 | rec_batch_size: int = 512 61 | } 62 | ) 63 | ------------ 64 | folder: cdp 65 | cdp_etiquette.pdf parsing took: 1.71s 66 | folder: scanned-tables 67 | POZIBILAN 2022.pdf parsing took: 17.76s 68 | Banco Popilar Number 2.pdf parsing took: 19.25s 69 | folder: native 70 | 00b03d60-fe45-4318-a511-18ee921b7bbb.pdf parsing took: 0.96s 71 | 0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf parsing took: 12.57s 72 | 0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf parsing took: 1.53s 73 | folder: scanned 74 | machine.pdf parsing took: 9.90s 75 | medical.pdf parsing took: 13.09s 76 | les_americains.pdf parsing took: 139.53s 77 | agency.pdf parsing took: 10.73s 78 | clark.pdf parsing took: 10.69s 79 | tables_ocr.pdf parsing took: 15.58s 80 | folder: rich 81 | language_learning.pdf parsing took: 1.74s 82 | dites nous tout....pdf parsing took: 0.64s 83 | ---- 84 | | Type | PDF Name | Unstructured(HI-RES) | Unstructured(FAST) | Megaparse( w/ doctr COREML) | 85 | |------------------|-----------------------------------|---------------------|----------------------|--------------------| 86 | | **cdp** | cdp_etiquette.pdf | 2.10s | 0.05s (bad parsing) | 1.71s | 87 | | **scanned-tables** | POZIBILAN 2022.pdf | 78.72s | can't parse | 17.76s | 88 | | **scanned-tables** | Banco Popilar Number 2.pdf | 94.44s | can't parse | 19.25s | 89 | | **native** | 00b03d60-fe45-4318-a511-18ee921b7bbb.pdf | 3.25s | 0.07s | 0.96s | 90 | | **native** | 0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf | 39.75s | 0.86s | 12.57s | 91 | | **native** | 0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf | 25.02s | 0.24s | 1.53s | 92 | | **scanned** | machine.pdf | 54.29s | 0.02s | 9.90s | 93 | | **scanned** | medical.pdf | 76.11s | 0.04s | 13.09s | 94 | | **scanned** | les_americains.pdf | 643.84s | 5.90s | 139.53s | 95 | | **scanned** | agency.pdf | 114.19s | 
can't parse | 10.73s | 96 | | **scanned** | clark.pdf | 28.89s | can't parse | 10.69s | 97 | | **scanned** | tables_ocr.pdf | 81.21s | can't parse | 15.58s | 98 | | **rich** | language_learning.pdf | 2.60s | can't parse | 1.74s | 99 | | **rich** | dites nous tout....pdf | 1.62s | 0.02s | 0.64s | 100 | -------------------------------------------------------------------------------- /libs/megaparse/examples/parse_file_fast.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | from time import perf_counter 4 | 5 | from unstructured.partition.auto import partition 6 | 7 | 8 | @dataclass 9 | class File: 10 | file_path: str 11 | file_name: str 12 | file_extension: str 13 | 14 | 15 | def list_files_in_directory(directory_path: str) -> dict[str, list[File]]: 16 | directory_dict = {} 17 | for root, _, files in os.walk(directory_path): 18 | folder_name = os.path.basename(root) 19 | if len(folder_name) > 0: 20 | file_list = [] 21 | for file_name in files: 22 | file_path = os.path.join(root, file_name) 23 | file_extension = os.path.splitext(file_name)[1] 24 | file_list.append( 25 | File( 26 | file_path=file_path, 27 | file_name=file_name, 28 | file_extension=file_extension, 29 | ) 30 | ) 31 | directory_dict[folder_name] = file_list 32 | 33 | return directory_dict 34 | 35 | 36 | def main(): 37 | file_path = "/Users/amine/data/quivr/parsing/native/0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf" 38 | folder_path = "/Users/amine/data/quivr/parsing/" 39 | 40 | list_files = list_files_in_directory(folder_path) 41 | 42 | for folder_name, files in list_files.items(): 43 | print(f"folder: {folder_name}") 44 | for file in files: 45 | if file.file_extension == ".pdf": 46 | s = perf_counter() 47 | elements = partition( 48 | filename=file.file_path, 49 | strategy="fast", 50 | ) 51 | if len(elements) == 0: 52 | print(f"\t{file.file_name}: can't parse ") 53 | continue 54 | 55 | e = perf_counter() 56 | print(f"\t 
{file.file_name} parsing took: {e-s:.2f}s") 57 | 58 | 59 | if __name__ == "__main__": 60 | els = main() 61 | -------------------------------------------------------------------------------- /libs/megaparse/examples/parse_file_mp.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | from time import perf_counter 4 | 5 | from megaparse import MegaParse 6 | from megaparse.configs.auto import DeviceEnum, MegaParseConfig 7 | 8 | 9 | @dataclass 10 | class File: 11 | file_path: str 12 | file_name: str 13 | file_extension: str 14 | 15 | 16 | def list_files_in_directory(directory_path: str) -> dict[str, list[File]]: 17 | directory_dict = {} 18 | for root, _, files in os.walk(directory_path): 19 | folder_name = os.path.basename(root) 20 | if len(folder_name) > 0: 21 | file_list = [] 22 | for file_name in files: 23 | file_path = os.path.join(root, file_name) 24 | file_extension = os.path.splitext(file_name)[1] 25 | file_list.append( 26 | File( 27 | file_path=file_path, 28 | file_name=file_name, 29 | file_extension=file_extension, 30 | ) 31 | ) 32 | directory_dict[folder_name] = file_list 33 | 34 | return directory_dict 35 | 36 | 37 | def main(): 38 | folder_path = "/Users/amine/data/quivr/parsing/" 39 | 40 | list_files = list_files_in_directory(folder_path) 41 | config = MegaParseConfig(device=DeviceEnum.COREML) 42 | mp = MegaParse(config=config) 43 | 44 | for folder_name, files in list_files.items(): 45 | print(f"folder: {folder_name}") 46 | for file in files: 47 | if file.file_extension == ".pdf": 48 | s = perf_counter() 49 | result = mp.load(file.file_path) 50 | if len(result) == 0: 51 | print(f"\t{file.file_name}: can't parse ") 52 | continue 53 | 54 | e = perf_counter() 55 | print(f"\t {file.file_name} parsing took: {e-s:.2f}s") 56 | 57 | 58 | if __name__ == "__main__": 59 | els = main() 60 | -------------------------------------------------------------------------------- 
/libs/megaparse/examples/parse_file_unstructured.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | from time import perf_counter 4 | 5 | from unstructured.partition.auto import partition 6 | 7 | 8 | @dataclass 9 | class File: 10 | file_path: str 11 | file_name: str 12 | file_extension: str 13 | 14 | 15 | def list_files_in_directory(directory_path: str) -> dict[str, list[File]]: 16 | directory_dict = {} 17 | for root, _, files in os.walk(directory_path): 18 | folder_name = os.path.basename(root) 19 | if len(folder_name) > 0: 20 | file_list = [] 21 | for file_name in files: 22 | file_path = os.path.join(root, file_name) 23 | file_extension = os.path.splitext(file_name)[1] 24 | file_list.append( 25 | File( 26 | file_path=file_path, 27 | file_name=file_name, 28 | file_extension=file_extension, 29 | ) 30 | ) 31 | directory_dict[folder_name] = file_list 32 | 33 | return directory_dict 34 | 35 | 36 | def main(): 37 | file_path = "/Users/amine/data/quivr/parsing/native/0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf" 38 | folder_path = "/Users/amine/data/quivr/parsing/" 39 | 40 | list_files = list_files_in_directory(folder_path) 41 | 42 | for folder_name, files in list_files.items(): 43 | print(f"folder: {folder_name}") 44 | for file in files: 45 | if file.file_extension == ".pdf": 46 | s = perf_counter() 47 | _ = partition( 48 | filename=file.file_path, 49 | strategy="hi_res", 50 | ) 51 | e = perf_counter() 52 | print(f"\t {file.file_name} parsing took: {e-s:.2f}s") 53 | 54 | 55 | if __name__ == "__main__": 56 | els = main() 57 | -------------------------------------------------------------------------------- /libs/megaparse/program.prof: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/program.prof 
-------------------------------------------------------------------------------- /libs/megaparse/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "megaparse" 3 | version = "0.0.55" 4 | authors = [ 5 | { name = "Stan Girard", email = "stan@quivr.app" }, 6 | { name = "Chloé Daems", email = "chloe@quivr.app" }, 7 | { name = "Amine Dirhoussi", email = "amine@quivr.app" }, 8 | { name = "Jacopo Chevallard", email = "jacopo@quivr.app" }, 9 | ] 10 | 11 | readme = "README.md" 12 | requires-python = ">= 3.11" 13 | 14 | dependencies = [ 15 | "megaparse-sdk", 16 | "pycryptodome>=3.21.0", 17 | "pdfplumber>=0.11.0", 18 | "backoff>=2.2.1", 19 | "pypdf>=5.0.1", 20 | "psutil>=6.1.0", 21 | "numpy<=2.0.0", 22 | "playwright>=1.47.0", 23 | "langchain-anthropic>=0.1.23", 24 | "python-magic>=0.4.27", 25 | "unstructured[all-docs]==0.15.0", 26 | "langchain>=0.3,<0.4", 27 | "langchain-community>=0.3,<0.4", 28 | "langchain-openai>=0.1.21", 29 | "langchain-core>=0.3,<0.4", 30 | "llama-parse>=0.4.0", 31 | "pydantic-settings>=2.6.1", 32 | "onnxruntime==1.20.0; platform_machine == 'x86_64'", 33 | "onnxruntime-gpu==1.20.0; platform_machine == 'x86_64'", 34 | "onnxtr[gpu-headless]>=0.6.0; platform_machine == 'x86_64'", 35 | "onnxtr[cpu]>=0.6.0; platform_machine != 'x86_64'", 36 | "pypdfium2>=4.30.0", 37 | ] 38 | 39 | [project.optional-dependencies] 40 | api = [ 41 | "python-dotenv>=1.0.0", 42 | "uvloop>=0.18.0", 43 | "pydantic-settings>=2.6.1", 44 | "uvicorn>=0.32.0", 45 | "fastapi>=0.115.2", 46 | "ratelimit>=2.2.1", 47 | 48 | ] 49 | 50 | 51 | [build-system] 52 | requires = ["hatchling==1.26.3"] 53 | build-backend = "hatchling.build" 54 | 55 | [tool.rye] 56 | managed = true 57 | dev-dependencies = [] 58 | universal = true 59 | 60 | [tool.hatch.metadata] 61 | allow-direct-references = true 62 | 63 | [tool.hatch.build.targets.wheel] 64 | packages = ["src/megaparse", "src/api"] 65 | 
-------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/__init__.py: -------------------------------------------------------------------------------- 1 | from .megaparse import MegaParse 2 | 3 | __all__ = ["MegaParse"] 4 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/src/megaparse/api/__init__.py -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/api/app.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import tempfile 4 | from typing import Any, Optional 5 | 6 | import httpx 7 | import psutil 8 | import uvicorn 9 | from fastapi import Depends, FastAPI, File, Form, HTTPException, UploadFile 10 | from langchain_anthropic import ChatAnthropic 11 | from langchain_community.document_loaders import PlaywrightURLLoader 12 | from langchain_openai import ChatOpenAI 13 | from llama_parse.utils import Language 14 | from megaparse_sdk.schema.document import Document 15 | from megaparse_sdk.schema.parser_config import ( 16 | ParserType, 17 | StrategyEnum, 18 | ) 19 | from megaparse_sdk.schema.supported_models import SupportedModel 20 | 21 | from megaparse import MegaParse 22 | from megaparse.api.exceptions.megaparse_exceptions import ( 23 | HTTPDownloadError, 24 | HTTPFileNotFound, 25 | HTTPModelNotSupported, 26 | HTTPParsingException, 27 | ParsingException, 28 | ) 29 | from megaparse.parser.builder import ParserBuilder 30 | 31 | app = FastAPI() 32 | 33 | playwright_loader = PlaywrightURLLoader(urls=[], remove_selectors=["header", "footer"]) 34 | 35 | 36 | def parser_builder_dep(): 37 | return ParserBuilder() 38 | 39 | 40 | def 
get_playwright_loader(): 41 | return playwright_loader 42 | 43 | 44 | @app.get("/healthz") 45 | def healthz(): 46 | return {"status": "ok"} 47 | 48 | 49 | def _check_free_memory() -> bool: 50 | """Reject traffic when free memory is below minimum (default 2GB).""" 51 | mem = psutil.virtual_memory() 52 | memory_free_minimum = int(os.environ.get("MEMORY_FREE_MINIMUM_MB", 2048)) 53 | 54 | if mem.available <= memory_free_minimum * 1024 * 1024: 55 | return False 56 | return True 57 | 58 | 59 | @app.post( 60 | "/v1/file", 61 | ) 62 | async def parse_file( 63 | file: UploadFile = File(...), 64 | method: ParserType = Form(ParserType.UNSTRUCTURED), 65 | strategy: StrategyEnum = Form(StrategyEnum.AUTO), 66 | check_table: bool = Form(False), 67 | language: Language = Form(Language.ENGLISH), 68 | parsing_instruction: Optional[str] = Form(None), 69 | model_name: Optional[SupportedModel] = Form(SupportedModel.GPT_4O), 70 | parser_builder=Depends(parser_builder_dep), 71 | ) -> dict[str, str | Document]: 72 | if not _check_free_memory(): 73 | raise HTTPException( 74 | status_code=503, detail="Service unavailable due to low memory" 75 | ) 76 | model = None 77 | if model_name and check_table: 78 | if model_name.startswith("gpt"): 79 | model = ChatOpenAI(model=model_name, api_key=os.getenv("OPENAI_API_KEY")) # type: ignore 80 | elif model_name.startswith("claude"): 81 | model = ChatAnthropic( 82 | model_name=model_name, 83 | api_key=os.getenv("ANTHROPIC_API_KEY"), # type: ignore 84 | timeout=60, 85 | stop=None, 86 | ) 87 | 88 | else: 89 | raise HTTPModelNotSupported() 90 | 91 | # parser_config = ParseFileConfig( #FIXME 92 | # method=method, 93 | # strategy=strategy, 94 | # llm_model_name=SupportedModel(model_name) if model_name and check_table else None, 95 | # language=language, 96 | # parsing_instruction=parsing_instruction, 97 | # ) 98 | try: 99 | # parser = parser_builder.build(parser_config) 100 | megaparse = MegaParse() 101 | if not file.filename: 102 | raise 
HTTPFileNotFound("No filename provided") 103 | _, extension = os.path.splitext(file.filename) 104 | file_bytes = await file.read() 105 | file_stream = io.BytesIO(file_bytes) 106 | result = await megaparse.aload(file=file_stream, file_extension=extension) 107 | return {"message": "File parsed successfully", "result": result} 108 | except ParsingException as e: 109 | print(e) 110 | raise HTTPParsingException(file.filename) 111 | except ValueError as e: 112 | print(e) 113 | raise HTTPException(status_code=400, detail=str(e)) 114 | except Exception as e: 115 | print(e) 116 | raise HTTPException(status_code=500, detail=str(e)) 117 | 118 | 119 | @app.post( 120 | "/v1/url", 121 | ) 122 | async def upload_url( 123 | url: str, playwright_loader=Depends(get_playwright_loader) 124 | ) -> dict[str, Any]: 125 | playwright_loader.urls = [url] 126 | 127 | if url.endswith(".pdf"): 128 | ## Download the file 129 | 130 | async with httpx.AsyncClient() as client: 131 | response = await client.get(url) 132 | if response.status_code != 200: 133 | raise HTTPDownloadError(url) 134 | 135 | with tempfile.NamedTemporaryFile(delete=False, suffix="pdf") as temp_file: 136 | temp_file.write(response.content) 137 | try: 138 | megaparse = MegaParse() 139 | result = await megaparse.aload(temp_file.name) 140 | return {"message": "File parsed successfully", "result": result} 141 | except ParsingException: 142 | raise HTTPParsingException(url) 143 | else: 144 | data = await playwright_loader.aload() 145 | # Now turn the data into a string 146 | extracted_content = "" 147 | for page in data: 148 | extracted_content += page.page_content 149 | if not extracted_content: 150 | raise HTTPDownloadError( 151 | url, 152 | message="Failed to extract content from the website. 
Valid URL example : https://www.quivr.com", 153 | ) 154 | return { 155 | "message": "Website content parsed successfully", 156 | "result": extracted_content, 157 | } 158 | 159 | 160 | if __name__ == "__main__": 161 | uvicorn.run(app, host="0.0.0.0", port=8000) 162 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/api/exceptions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/src/megaparse/api/exceptions/__init__.py -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/api/exceptions/megaparse_exceptions.py: -------------------------------------------------------------------------------- 1 | from fastapi import HTTPException 2 | 3 | 4 | class HTTPModelNotSupported(HTTPException): 5 | def __init__( 6 | self, 7 | detail: str = "The requested model is not supported yet.", 8 | headers: dict | None = None, 9 | ): 10 | super().__init__(status_code=501, detail=detail, headers=headers) 11 | 12 | 13 | class HTTPFileNotFound(HTTPException): 14 | def __init__( 15 | self, 16 | message="The UploadFile.filename does not exist and is needed for this operation", 17 | ): 18 | super().__init__(status_code=404, detail=message) 19 | 20 | 21 | class HTTPDownloadError(HTTPException): 22 | def __init__(self, file_name, message="Failed to download the file"): 23 | message = f"{file_name} : {message}" 24 | super().__init__(status_code=400, detail=message) 25 | 26 | 27 | class HTTPParsingException(HTTPException): 28 | def __init__(self, file_name, message="Failed to parse the file"): 29 | message = f"{file_name} : {message}" 30 | super().__init__(status_code=500, detail=message) 31 | 32 | 33 | class ParsingException(Exception): 34 | """Exception raised for errors in the parsing process.""" 35 | 36 | def __init__(self, 
message="An error occurred during parsing"): 37 | self.message = message 38 | super().__init__(self.message) 39 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/api/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/src/megaparse/api/models/__init__.py -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/api/models/base.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class MarkDownType(str, Enum): 5 | """Markdown type enumeration.""" 6 | 7 | TITLE = "Title" 8 | SUBTITLE = "Subtitle" 9 | HEADER = "Header" 10 | FOOTER = "Footer" 11 | NARRATIVE_TEXT = "NarrativeText" 12 | LIST_ITEM = "ListItem" 13 | TABLE = "Table" 14 | PAGE_BREAK = "PageBreak" 15 | IMAGE = "Image" 16 | FORMULA = "Formula" 17 | FIGURE_CAPTION = "FigureCaption" 18 | ADDRESS = "Address" 19 | EMAIL_ADDRESS = "EmailAddress" 20 | CODE_SNIPPET = "CodeSnippet" 21 | PAGE_NUMBER = "PageNumber" 22 | DEFAULT = "Default" 23 | UNDEFINED = "Undefined" 24 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/configs/auto.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | from pydantic import BaseModel 4 | from pydantic_settings import BaseSettings, SettingsConfigDict 5 | 6 | 7 | class TextDetConfig(BaseModel): 8 | det_arch: str = "fast_base" 9 | batch_size: int = 2 10 | assume_straight_pages: bool = True 11 | preserve_aspect_ratio: bool = True 12 | symmetric_pad: bool = True 13 | load_in_8_bit: bool = False 14 | 15 | 16 | class AutoStrategyConfig(BaseModel): 17 | page_threshold: float = 0.6 18 | document_threshold: float = 0.2 19 | 20 | 21 | 
class TextRecoConfig(BaseModel): 22 | reco_arch: str = "crnn_vgg16_bn" 23 | batch_size: int = 512 24 | 25 | 26 | class DeviceEnum(str, Enum): 27 | CPU = "cpu" 28 | CUDA = "cuda" 29 | COREML = "coreml" 30 | 31 | 32 | class DoctrConfig(BaseModel): 33 | straighten_pages: bool = False 34 | detect_orientation: bool = False 35 | detect_language: bool = False 36 | text_det_config: TextDetConfig = TextDetConfig() 37 | text_reco_config: TextRecoConfig = TextRecoConfig() 38 | 39 | 40 | class MegaParseConfig(BaseSettings): 41 | """ 42 | Configuration for Megaparse. 43 | """ 44 | 45 | model_config = SettingsConfigDict( 46 | env_prefix="MEGAPARSE_", 47 | env_file=(".env.local", ".env"), 48 | env_nested_delimiter="__", 49 | extra="ignore", 50 | use_enum_values=True, 51 | ) 52 | doctr_config: DoctrConfig = DoctrConfig() 53 | auto_config: AutoStrategyConfig = AutoStrategyConfig() 54 | device: DeviceEnum = DeviceEnum.CPU 55 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/examples/parse_file.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from megaparse.megaparse import MegaParse 4 | from pydantic import BaseModel, Field 5 | 6 | 7 | class MyCustomFormat(BaseModel): 8 | title: str = Field(description="The title of the document.") 9 | problem: str = Field(description="The problem statement.") 10 | solution: str = Field(description="The solution statement.") 11 | 12 | 13 | def main(): 14 | # model = ChatOpenAI(name="gpt-4o") 15 | # formatter_1 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat) 16 | 17 | megaparse = MegaParse() 18 | 19 | file_path = Path("./tests/pdf/ocr/0168127.pdf") 20 | result = megaparse.load(file_path=file_path) 21 | print(result) 22 | 23 | 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- 
/libs/megaparse/src/megaparse/exceptions/base.py: -------------------------------------------------------------------------------- 1 | class ParsingException(Exception): 2 | """Exception raised for errors in the parsing process.""" 3 | 4 | def __init__(self, message="An error occurred during parsing"): 5 | self.message = message 6 | super().__init__(self.message) 7 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/formatter/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from pathlib import Path 3 | from typing import Union 4 | 5 | from langchain_core.language_models.chat_models import BaseChatModel 6 | from megaparse_sdk.schema.document import Document 7 | 8 | 9 | class BaseFormatter(ABC): 10 | """ 11 | A class used to improve the layout of elements, particularly focusing on converting HTML tables to markdown tables. 12 | Attributes 13 | ---------- 14 | model : BaseChatModel 15 | An instance of a chat model used to process and improve the layout of elements. 16 | Methods 17 | ------- 18 | improve_layout(elements: List[Element]) -> List[Element] 19 | Processes a list of elements, converting HTML tables to markdown tables and improving the overall layout. 
20 | """ 21 | 22 | def __init__(self, model: BaseChatModel | None = None): 23 | self.model = model 24 | 25 | def format( 26 | self, document: Document, file_path: Path | str | None = None 27 | ) -> Union[Document, str]: 28 | raise NotImplementedError("Subclasses should implement this method") 29 | 30 | async def aformat( 31 | self, document: Document, file_path: Path | str | None = None 32 | ) -> Union[Document, str]: 33 | raise NotImplementedError("Subclasses should implement this method") 34 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from langchain_core.language_models.chat_models import BaseChatModel 4 | from megaparse.formatter.base import BaseFormatter 5 | from megaparse_sdk.schema.document import Document 6 | from pydantic import BaseModel 7 | 8 | 9 | class StructuredFormatter(BaseFormatter): 10 | def __init__(self, model: BaseChatModel, output_model: type[BaseModel]): 11 | super().__init__(model) 12 | self.output_model = output_model 13 | 14 | async def aformat( 15 | self, 16 | document: Document, 17 | file_path: Path | str | None = None, 18 | ) -> str: # FIXME: Return a structured output of type BaseModel ? 19 | raise NotImplementedError() 20 | 21 | def format( 22 | self, 23 | document: Document, 24 | file_path: Path | str | None = None, 25 | ) -> str: # FIXME: Return a structured output of type BaseModel ? 
26 | raise NotImplementedError() 27 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from megaparse.formatter.structured_formatter import StructuredFormatter 4 | from megaparse_sdk.schema.document import Document 5 | from pydantic import BaseModel 6 | 7 | 8 | class CustomStructuredFormatter(StructuredFormatter): 9 | def format( 10 | self, 11 | document: Document, 12 | file_path: Path | str | None = None, 13 | ) -> str: 14 | """ 15 | Structure the file using an AI language model. 16 | Args: 17 | text: The text to format. 18 | file_path: The file path of the text. 19 | model: The AI language model to use for formatting. 20 | Returns: 21 | The structured text. 22 | """ 23 | if not self.model: 24 | raise ValueError("A Model is needed to use the CustomStructuredFormatter.") 25 | print("Formatting text using CustomStructuredFormatter...") 26 | text = str(document) 27 | if len(text) < 0: 28 | raise ValueError( 29 | "A non empty text is needed to format text using CustomStructuredFormatter." 30 | ) 31 | if not self.output_model: 32 | raise ValueError( 33 | "An output model is needed to structure text using CustomStructuredFormatter." 34 | ) 35 | 36 | structured_model = self.model.with_structured_output(self.output_model) # type: ignore 37 | 38 | formatted_text = structured_model.invoke( 39 | f"Parse the text in a structured format: {text}" 40 | ) 41 | assert isinstance(formatted_text, BaseModel), "Model output is not a BaseModel." 42 | 43 | return formatted_text.model_dump_json() 44 | 45 | async def aformat( 46 | self, 47 | document: Document, 48 | file_path: Path | str | None = None, 49 | ) -> str: 50 | """ 51 | Asynchronously structure the file using an AI language model. 52 | Args: 53 | text: The text to format. 
54 | file_path: The file path of the text. 55 | model: The AI language model to use for formatting. 56 | Returns: 57 | The structured text. 58 | """ 59 | if not self.model: 60 | raise ValueError("A Model is needed to use the CustomStructuredFormatter.") 61 | print("Formatting text using CustomStructuredFormatter...") 62 | text = str(document) 63 | 64 | if len(text) < 0: 65 | raise ValueError( 66 | "A non empty text is needed to format text using CustomStructuredFormatter." 67 | ) 68 | if not self.output_model: 69 | raise ValueError( 70 | "An output model is needed to structure text using CustomStructuredFormatter." 71 | ) 72 | 73 | structured_model = self.model.with_structured_output(self.output_model) # type: ignore 74 | 75 | formatted_text = await structured_model.ainvoke( 76 | f"Parse the text in a structured format: {text}" 77 | ) 78 | assert isinstance(formatted_text, BaseModel), "Model output is not a BaseModel." 79 | 80 | return formatted_text.model_dump_json() 81 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from megaparse.formatter.base import BaseFormatter 4 | from megaparse_sdk.schema.document import Document 5 | 6 | 7 | class TableFormatter(BaseFormatter): 8 | def format( 9 | self, document: Document, file_path: Path | str | None = None 10 | ) -> Document: 11 | raise NotImplementedError("Subclasses should implement this method") 12 | 13 | async def aformat( 14 | self, document: Document, file_path: Path | str | None = None 15 | ) -> Document: 16 | raise NotImplementedError("Subclasses should implement this method") 17 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | import warnings 3 | from pathlib import Path 4 | from typing import Optional 5 | 6 | from langchain_core.language_models.chat_models import BaseChatModel 7 | from langchain_core.prompts import ChatPromptTemplate 8 | from megaparse.formatter.table_formatter import TableFormatter 9 | from megaparse_sdk.schema.document import Document, TableBlock 10 | 11 | 12 | class SimpleMDTableFormatter(TableFormatter): 13 | """ 14 | A formatter that converts table elements into Markdown format using llms. 15 | """ 16 | 17 | TABLE_MARKER_START = "[TABLE]" 18 | TABLE_MARKER_END = "[/TABLE]" 19 | CODE_BLOCK_PATTERN = r"^```.*$\n?" 20 | 21 | def __init__(self, model: Optional[BaseChatModel] = None): 22 | super().__init__(model) 23 | 24 | async def aformat( 25 | self, document: Document, file_path: Path | str | None = None 26 | ) -> Document: 27 | warnings.warn( 28 | "The SimpleMDTableFormatter is a sync formatter, please use the sync format method", 29 | UserWarning, 30 | stacklevel=2, 31 | ) 32 | return self.format(document=document, file_path=file_path) 33 | 34 | def format( 35 | self, document: Document, file_path: Path | str | None = None 36 | ) -> Document: 37 | """ 38 | Formats table elements within a list of elements. 39 | Args: 40 | elements: A list of Element objects. 41 | Returns: 42 | A list of Element objects with formatted tables. 
43 | """ 44 | if not self.model: 45 | raise ValueError("A Model is needed to use the SimpleMDTableFormatter.") 46 | print("Formatting tables using SimpleMDTableFormatter...") 47 | table_stack = [] 48 | formatted_elements = [] 49 | 50 | for block in document.content: 51 | if isinstance(block, TableBlock): 52 | previous_table = table_stack[-1] if table_stack else "" 53 | formatted_table = self.format_table(block, previous_table) 54 | table_stack.append(formatted_table.text) 55 | formatted_elements.append(formatted_table) 56 | else: 57 | formatted_elements.append(block) 58 | 59 | document.content = formatted_elements 60 | return document 61 | 62 | def format_table( 63 | self, table_element: TableBlock, previous_table: str 64 | ) -> TableBlock: 65 | """ 66 | Formats a single table element into Markdown using an AI language model. 67 | Args: 68 | table_element: The table element to format. 69 | previous_table: The previously formatted table text. 70 | Returns: 71 | The formatted table element. 72 | """ 73 | assert self.model is not None, "Model is not set." 74 | 75 | prompt = ChatPromptTemplate.from_messages( 76 | [ 77 | ( 78 | "human", 79 | ( 80 | "You are an expert in markdown tables. Transform the following parsed table into a " 81 | "markdown table. 
Provide just the table in pure markdown, nothing else.\n" 82 | "\n{text}\n\n" 83 | "\n{previous_table}\n" 84 | ), 85 | ), 86 | ] 87 | ) 88 | 89 | chain = prompt | self.model 90 | result = chain.invoke( 91 | { 92 | "text": table_element.text, 93 | "previous_table": previous_table, 94 | } 95 | ) 96 | 97 | content_str = str(result.content) 98 | cleaned_content = re.sub( 99 | self.CODE_BLOCK_PATTERN, "", content_str, flags=re.MULTILINE 100 | ) 101 | markdown_table = ( 102 | f"{self.TABLE_MARKER_START}\n" 103 | f"{cleaned_content}\n" 104 | f"{self.TABLE_MARKER_END}\n\n" 105 | ) 106 | 107 | table_element.text = markdown_table 108 | 109 | return table_element 110 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from io import BytesIO 3 | from pathlib import Path 4 | from typing import List, Optional 5 | 6 | from langchain_core.language_models.chat_models import BaseChatModel 7 | from langchain_core.messages import HumanMessage 8 | from megaparse.formatter.table_formatter import TableFormatter 9 | from megaparse_sdk.schema.document import Document, TableBlock 10 | from pdf2image import convert_from_path 11 | from PIL import Image 12 | 13 | TABLE_OCR_PROMPT = """ 14 | You are tasked with transcribing the content of a table into markdown format. Your goal is to create a well-structured, readable markdown table that accurately represents the original content while adding appropriate formatting. 15 | Answer uniquely with the parsed table. Do not include the fenced code blocks backticks. 16 | """ 17 | 18 | 19 | class VisionMDTableFormatter(TableFormatter): 20 | """ 21 | A formatter that converts table elements into Markdown format using an AI language model. 
22 | """ 23 | 24 | TABLE_MARKER_START = "[TABLE]" 25 | TABLE_MARKER_END = "[/TABLE]" 26 | CODE_BLOCK_PATTERN = r"^```.*$\n?" 27 | 28 | def __init__(self, model: Optional[BaseChatModel] = None): 29 | super().__init__(model) 30 | 31 | def _crop_table_image(self, table_element: TableBlock, file_path: str) -> str: 32 | """ 33 | Helper method to crop the table portion of the PDF page and convert it to a base64 string. 34 | """ 35 | assert table_element.bbox, "Table element must have coordinates." 36 | bbox = table_element.bbox 37 | page_number = table_element.page_range[0] 38 | assert page_number, "Table element must have a page number." 39 | assert bbox, "Table element must have coordinates." 40 | 41 | pages = convert_from_path(file_path) 42 | 43 | # Calculate the box for cropping 44 | box = ( 45 | bbox.top_left.x, 46 | bbox.top_left.y, 47 | bbox.bottom_right.x, 48 | bbox.bottom_right.y, 49 | ) 50 | table_image = pages[page_number - 1].crop(box) 51 | # Convert the cropped image to base64 52 | table_image64 = self.process_file([table_image])[0] 53 | return table_image64 54 | 55 | async def aformat( 56 | self, document: Document, file_path: Path | str | None = None 57 | ) -> Document: 58 | """ 59 | Asynchronously formats table elements within a list of elements. 60 | """ 61 | if not self.model: 62 | raise ValueError("A Model is needed to use the VisionMDTableFormatter.") 63 | print("Formatting tables using VisionMDTableFormatter (async)...") 64 | assert ( 65 | file_path 66 | ), "A file path is needed to format tables using VisionMDTableFormatter." 
67 | if not isinstance(file_path, str): 68 | file_path = str(file_path) 69 | formatted_elements = [] 70 | for block in document.content: 71 | if isinstance(block, TableBlock): 72 | formatted_table = await self.aformat_table(block, file_path) 73 | formatted_elements.append(formatted_table) 74 | else: 75 | formatted_elements.append(block) 76 | 77 | document.content = formatted_elements 78 | return document 79 | 80 | def format( 81 | self, document: Document, file_path: Path | str | None = None 82 | ) -> Document: 83 | """ 84 | Asynchronously formats table elements within a list of elements. 85 | """ 86 | if not self.model: 87 | raise ValueError("A Model is needed to use the VisionMDTableFormatter.") 88 | print("Formatting tables using VisionMDTableFormatter (async)...") 89 | assert ( 90 | file_path 91 | ), "A file path is needed to format tables using VisionMDTableFormatter." 92 | if not isinstance(file_path, str): 93 | file_path = str(file_path) 94 | formatted_elements = [] 95 | for block in document.content: 96 | if isinstance(block, TableBlock): 97 | formatted_table = self.format_table(block, file_path) 98 | formatted_elements.append(formatted_table) 99 | else: 100 | formatted_elements.append(block) 101 | 102 | document.content = formatted_elements 103 | return document 104 | 105 | async def aformat_table( 106 | self, table_element: TableBlock, file_path: str 107 | ) -> TableBlock: 108 | """ 109 | Asynchronously formats a table element into Markdown format using a Vision Model. 
110 | """ 111 | table_image64 = self._crop_table_image(table_element, file_path) 112 | formatted_table = await self.avision_extract(table_image64) 113 | 114 | markdown_table = ( 115 | f"{self.TABLE_MARKER_START}\n" 116 | f"{formatted_table}\n" 117 | f"{self.TABLE_MARKER_END}\n\n" 118 | ) 119 | # Replace the element's text with the formatted table text 120 | table_element.text = markdown_table 121 | return table_element 122 | 123 | def format_table(self, table_element: TableBlock, file_path: str) -> TableBlock: 124 | """ 125 | Asynchronously formats a table element into Markdown format using a Vision Model. 126 | """ 127 | table_image64 = self._crop_table_image(table_element, file_path) 128 | formatted_table = self.vision_extract(table_image64) 129 | 130 | markdown_table = ( 131 | f"{self.TABLE_MARKER_START}\n" 132 | f"{formatted_table}\n" 133 | f"{self.TABLE_MARKER_END}\n\n" 134 | ) 135 | # Replace the element's text with the formatted table text 136 | table_element.text = markdown_table 137 | return table_element 138 | 139 | def process_file(self, images: List[Image.Image], image_format="PNG") -> List[str]: 140 | """ 141 | Convert a list of PIL images to base64 encoded images. 142 | """ 143 | try: 144 | images_base64 = [] 145 | for image in images: 146 | buffered = BytesIO() 147 | image.save(buffered, format=image_format) 148 | image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") 149 | images_base64.append(image_base64) 150 | return images_base64 151 | except Exception as e: 152 | raise ValueError(f"Error processing PDF file: {str(e)}") 153 | 154 | async def avision_extract(self, table_image: str) -> str: 155 | """ 156 | Asynchronously send image data to the language model for processing. 157 | """ 158 | assert ( 159 | self.model 160 | ), "A model is needed to use the VisionMDTableFormatter (async)." 
161 | image_prompt = { 162 | "type": "image_url", 163 | "image_url": {"url": f"data:image/jpeg;base64,{table_image}"}, 164 | } 165 | 166 | message = HumanMessage( 167 | content=[ 168 | {"type": "text", "text": TABLE_OCR_PROMPT}, 169 | image_prompt, 170 | ], 171 | ) 172 | response = await self.model.ainvoke([message]) 173 | return str(response.content) 174 | 175 | def vision_extract(self, table_image: str) -> str: 176 | """ 177 | Synchronously send image data to the language model for processing. 178 | """ 179 | assert self.model, "A model is needed to use the VisionMDTableFormatter (sync)." 180 | image_prompt = { 181 | "type": "image_url", 182 | "image_url": {"url": f"data:image/jpeg;base64,{table_image}"}, 183 | } 184 | 185 | message = HumanMessage( 186 | content=[ 187 | {"type": "text", "text": TABLE_OCR_PROMPT}, 188 | image_prompt, 189 | ], 190 | ) 191 | response = self.model.invoke([message]) 192 | return str(response.content) 193 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/layout_detection/layout_detector.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pathlib 4 | import uuid 5 | from typing import Any, List 6 | 7 | import numpy as np 8 | import onnxruntime as rt 9 | from megaparse.configs.auto import DeviceEnum 10 | from megaparse.layout_detection.output import LayoutDetectionOutput 11 | from megaparse.utils.onnx import get_providers 12 | from megaparse_sdk.schema.document import BBOX, Point2D 13 | from onnxtr.models.engine import EngineConfig 14 | from onnxtr.models.preprocessor import PreProcessor 15 | from PIL import Image, ImageDraw 16 | from PIL.Image import Image as PILImage 17 | 18 | logger = logging.getLogger("megaparse") 19 | 20 | LABEL_MAP = { 21 | 0: "Caption", 22 | 1: "Footnote", 23 | 2: "Formula", 24 | 3: "List-item", 25 | 4: "Page-footer", 26 | 5: "Page-header", 27 | 6: "Picture", 28 | 7: 
"Section-header", 29 | 8: "Table", 30 | 9: "Text", 31 | 10: "Title", 32 | } 33 | 34 | default_cfg: dict[str, dict[str, Any]] = { 35 | "yolov10s-doclaynet": { 36 | "mean": (0.5, 0.5, 0.5), 37 | "std": (1.0, 1.0, 1.0), 38 | "url_8_bit": None, 39 | "input_shape": (1, 1024, 1024), 40 | "url": pathlib.Path(__file__).parent.joinpath("models/yolov10s-doclaynet.onnx"), 41 | } 42 | } 43 | 44 | 45 | class LayoutDetector: 46 | def __init__( 47 | self, 48 | device: DeviceEnum = DeviceEnum.CPU, 49 | threshold: float = 0.1, 50 | preserve_aspect_ratio: bool = True, 51 | model_name: str = "yolov10s-doclaynet", 52 | load_in_8_bit: bool = False, 53 | ): 54 | model_config = default_cfg[model_name] 55 | self.device = device 56 | general_options = rt.SessionOptions() 57 | providers = get_providers(self.device) 58 | self.threshold = threshold 59 | self.batch_size, self.required_width, self.required_height = model_config[ 60 | "input_shape" 61 | ] 62 | self.preserve_aspect_ratio = preserve_aspect_ratio 63 | 64 | self.pre_processor = PreProcessor( 65 | output_size=(self.required_width, self.required_height), 66 | batch_size=self.batch_size, 67 | preserve_aspect_ratio=self.preserve_aspect_ratio, 68 | ) 69 | 70 | engine_config = EngineConfig( 71 | session_options=general_options, 72 | providers=providers, 73 | ) 74 | model_path = ( 75 | model_config.get("url_8_bit") if load_in_8_bit else model_config.get("url") 76 | ) 77 | assert model_path, f"Model path not found for {model_name}" 78 | 79 | self.model = rt.InferenceSession(model_path, engine_config=engine_config) 80 | 81 | def __call__( 82 | self, img_pages: list[PILImage], output_dir: str | None = None 83 | ) -> List[List[LayoutDetectionOutput]]: 84 | pages = [np.array(img) for img in img_pages] 85 | # Dimension check 86 | if any(page.ndim != 3 for page in pages): 87 | raise ValueError( 88 | "incorrect input shape: all pages are expected to be multi-channel 2D images." 
89 | ) 90 | processed_batches = self.pre_processor(pages) 91 | processed_batches = np.array(processed_batches) 92 | processed_batches = processed_batches.squeeze(1) # Horrendus 93 | processed_batches = processed_batches.transpose(0, 3, 1, 2) 94 | 95 | pred_batches = np.array( 96 | [ 97 | self.model.run(None, {"images": np.expand_dims(batch, axis=0)}) 98 | for batch in processed_batches 99 | ] 100 | ) 101 | pred_batches = np.concatenate(pred_batches, axis=0) 102 | pred_batches = pred_batches.squeeze(1) # Horrendus 103 | 104 | processed_preds = [] 105 | for page, pred in zip(pages, pred_batches, strict=True): 106 | img_h, img_w = page.shape[:2] 107 | bboxes = self.extract_bboxes_from_page(pred, img_h, img_w) 108 | processed_preds.append(bboxes) 109 | 110 | if output_dir: 111 | self._save_layout(pages=pages, preds=processed_preds, output_dir=output_dir) 112 | 113 | return processed_preds 114 | 115 | def extract_bboxes_from_page( 116 | self, preds: np.ndarray, img_h: int, img_w: int 117 | ) -> List[LayoutDetectionOutput]: 118 | results = [] 119 | 120 | assert preds.shape == (300, 6) 121 | 122 | scale_h = img_h / self.required_height 123 | scale_w = img_w / self.required_width 124 | 125 | for det in preds: 126 | # Rescale the bounding box coordinates to the original dimensions 127 | x1, y1, x2, y2, score, cls_idx = det 128 | if score < self.threshold: 129 | continue 130 | 131 | x1 *= scale_w 132 | x2 *= scale_w 133 | y1 *= scale_h 134 | y2 *= scale_h 135 | 136 | if self.preserve_aspect_ratio: 137 | ratio = img_h / img_w 138 | x1 = x1 * (ratio if ratio > 1 else 1) 139 | x2 = x2 * (ratio if ratio > 1 else 1) 140 | y1 = y1 / (ratio if ratio < 1 else 1) 141 | y2 = y2 / (ratio if ratio < 1 else 1) 142 | 143 | x1 = max(0, min(x1, img_w)) 144 | x2 = max(0, min(x2, img_w)) 145 | y1 = max(0, min(y1, img_h)) 146 | y2 = max(0, min(y2, img_h)) 147 | 148 | bbox_id = uuid.uuid4() 149 | 150 | results.append( 151 | LayoutDetectionOutput( 152 | bbox_id=bbox_id, 153 | bbox=BBOX( 154 | 
top_left=Point2D(x=x1 / img_w, y=y1 / img_h), 155 | bottom_right=Point2D(x=x2 / img_w, y=y2 / img_h), 156 | ), 157 | prob=det[4], 158 | label=int(det[5]), 159 | ) 160 | ) 161 | 162 | result = self.topK(results) # or topK 163 | return result 164 | 165 | def nms( 166 | self, 167 | raw_bboxes: List[LayoutDetectionOutput], 168 | iou_threshold: float = 0.9, # FIXME: thresh Configurable in constructor 169 | ) -> List[LayoutDetectionOutput]: 170 | """ 171 | Non-Maximum Suppression (NMS) algorithm. 172 | 173 | Args: 174 | raw_bboxes (list): List of LayoutBBox objects. 175 | iou_threshold (float): IoU threshold for suppression. 176 | 177 | Returns: 178 | None: The input list `raw_bboxes` is modified in-place. 179 | """ 180 | raw_bboxes.sort(key=lambda x: x.prob, reverse=True) 181 | 182 | current_index = 0 183 | for index in range(len(raw_bboxes)): 184 | drop = False 185 | for prev_index in range(current_index): 186 | iou = raw_bboxes[index].bbox.iou(raw_bboxes[prev_index].bbox) 187 | if iou > iou_threshold: 188 | drop = True 189 | break 190 | if not drop: 191 | raw_bboxes[current_index], raw_bboxes[index] = ( 192 | raw_bboxes[index], 193 | raw_bboxes[current_index], 194 | ) 195 | current_index += 1 196 | 197 | return raw_bboxes[:current_index] 198 | 199 | def topK( 200 | self, detectResult: List[LayoutDetectionOutput], topK: int = 50 201 | ) -> List[LayoutDetectionOutput]: 202 | if len(detectResult) <= topK: 203 | return detectResult 204 | else: 205 | predBoxs = [] 206 | sort_detectboxs = sorted(detectResult, key=lambda x: x.prob, reverse=True) 207 | for i in range(topK): 208 | predBoxs.append(sort_detectboxs[i]) 209 | return predBoxs 210 | 211 | def _save_layout( 212 | self, 213 | pages: list[np.ndarray], 214 | preds: list[list[LayoutDetectionOutput]], 215 | output_dir: str, 216 | ): 217 | os.makedirs(output_dir, exist_ok=True) 218 | for i, (page, layout) in enumerate(zip(pages, preds, strict=True)): 219 | image = Image.fromarray(page) 220 | draw = ImageDraw.Draw(image) 
class LayoutDetectionOutput(BaseModel):
    """A single layout region detected on one page.

    Produced by ``LayoutDetector.extract_bboxes_from_page``, which divides the
    pixel coordinates by the page width/height, so ``bbox`` is normalized to
    the 0-1 range.
    """

    # Unique identifier assigned to this detection (uuid4 at creation time).
    bbox_id: UUID
    # Normalized bounding box of the detected region.
    bbox: BBOX
    # Model confidence score for this detection.
    prob: float
    # Integer class index; see LABEL_MAP in layout_detector.py for the names.
    label: int
PdfPage 8 | 9 | 10 | class PageDimension(BaseModel): 11 | """ 12 | A class to represent a page dimension 13 | """ 14 | 15 | width: float 16 | height: float 17 | 18 | 19 | class Page(BaseModel): 20 | """ 21 | A class to represent a page 22 | """ 23 | 24 | strategy: StrategyEnum 25 | text_detections: TextDetection | None = None 26 | rasterized: PILImage | None = None 27 | page_size: PageDimension 28 | page_index: int 29 | pdfium_elements: PdfPage 30 | 31 | model_config = ConfigDict(arbitrary_types_allowed=True) 32 | 33 | 34 | class GatewayDocument(BaseModel): 35 | """ 36 | A class to represent a Gateway MegaParse Document, which is a container of pages. 37 | """ 38 | 39 | file_name: str 40 | pages: List[Page] 41 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseParser 2 | 3 | __all__ = ["BaseParser"] 4 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/parser/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from pathlib import Path 3 | from typing import IO 4 | 5 | from megaparse_sdk.schema.document import Document 6 | from megaparse_sdk.schema.extensions import FileExtension 7 | 8 | 9 | class BaseParser(ABC): 10 | """Mother Class for all the parsers [Unstructured, LlamaParse, MegaParseVision]""" 11 | 12 | supported_extensions = [] 13 | 14 | def check_supported_extension( 15 | self, file_extension: FileExtension | None, file_path: str | Path | None = None 16 | ): 17 | if not file_extension and not file_path: 18 | raise ValueError( 19 | f"Either file_path or file_extension must be provided for {self.__class__.__name__}" 20 | ) 21 | if file_path and not file_extension: 22 | file_path = Path(file_path) if isinstance(file_path, str) else 
file_path 23 | file_extension = FileExtension(file_path.suffix) 24 | if file_extension and file_extension not in self.supported_extensions: 25 | raise ValueError( 26 | f"Unsupported file extension {file_extension.value} for {self.__class__.__name__}" 27 | ) 28 | 29 | @abstractmethod 30 | async def aconvert( 31 | self, 32 | file_path: str | Path | None = None, 33 | file: IO[bytes] | None = None, 34 | file_extension: FileExtension | None = None, 35 | **kwargs, 36 | ) -> Document: 37 | """ 38 | Convert the given file to a specific format. 39 | 40 | Args: 41 | file_path (str | Path): The path to the file to be converted. 42 | **kwargs: Additional keyword arguments for the conversion process. 43 | 44 | Returns: 45 | str: The result of the conversion process. 46 | 47 | Raises: 48 | NotImplementedError: If the method is not implemented by a subclass. 49 | """ 50 | raise NotImplementedError("Subclasses should implement this method") 51 | 52 | @abstractmethod 53 | def convert( 54 | self, 55 | file_path: str | Path | None = None, 56 | file: IO[bytes] | None = None, 57 | file_extension: FileExtension | None = None, 58 | **kwargs, 59 | ) -> Document: 60 | """ 61 | Convert the given file to the unstructured format. 62 | 63 | Args: 64 | file_path (str | Path): The path to the file to be converted. 65 | **kwargs: Additional keyword arguments for the conversion process. 66 | 67 | Returns: 68 | str: The result of the conversion process. 69 | 70 | Raises: 71 | NotImplementedError: If the method is not implemented by a subclass. 
72 | """ 73 | raise NotImplementedError("Subclasses should implement this method") 74 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/parser/builder.py: -------------------------------------------------------------------------------- 1 | from megaparse_sdk.schema.parser_config import ParseFileConfig 2 | 3 | from megaparse.parser.base import BaseParser 4 | from megaparse.parser.llama import LlamaParser 5 | from megaparse.parser.megaparse_vision import MegaParseVision 6 | from megaparse.parser.unstructured_parser import UnstructuredParser 7 | 8 | parser_dict: dict[str, type] = { 9 | "unstructured": UnstructuredParser, 10 | "llama_parser": LlamaParser, 11 | "megaparse_vision": MegaParseVision, 12 | } 13 | 14 | 15 | class ParserBuilder: 16 | def build(self, config: ParseFileConfig) -> BaseParser: 17 | """ 18 | Build a parser based on the given configuration. 19 | 20 | Args: 21 | config (ParserDict): The configuration to be used for building the parser. 22 | 23 | Returns: 24 | BaseParser: The built parser. 25 | 26 | Raises: 27 | ValueError: If the configuration is invalid. 
28 | """ 29 | return parser_dict[config.method](**config.model_dump()) 30 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/parser/entity.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import List, Optional 3 | 4 | 5 | class TagEnum(str, Enum): 6 | """Possible tags for the elements in the file""" 7 | 8 | TABLE = "TABLE" 9 | TOC = "TOC" 10 | HEADER = "HEADER" 11 | IMAGE = "IMAGE" 12 | 13 | 14 | class SupportedModel(Enum): 15 | GPT_4O = ("gpt-4o", None) 16 | GPT_4O_TURBO = ("gpt-4o-turbo", None) 17 | CLAUDE_3_5_SONNET = ("claude-3-5-sonnet", ["latest", "20241022"]) 18 | CLAUDE_3_OPUS = ("claude-3-opus", ["latest", "20240229"]) 19 | 20 | def __init__(self, model_name: str, supported_releases: Optional[List[str]]): 21 | self.model_name = model_name 22 | self.supported_releases = supported_releases 23 | 24 | @classmethod 25 | def is_supported(cls, model_name: str) -> bool: 26 | # Attempt to match model_name by checking if it starts with a known model name 27 | for model in cls: 28 | if model_name.startswith(model.model_name): 29 | # Extract the release version if available 30 | release = model_name[len(model.model_name) :].lstrip("-") or None 31 | # Check if the model supports this release 32 | if model.supported_releases is None: 33 | return True 34 | return release in model.supported_releases if release else False 35 | return False 36 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/parser/llama.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import IO, List 3 | 4 | from llama_index.core.schema import Document as LlamaDocument 5 | from llama_parse import LlamaParse as _LlamaParse 6 | from llama_parse.utils import Language, ResultType 7 | from megaparse_sdk.schema.document import BBOX, 
Point2D, TextBlock 8 | from megaparse_sdk.schema.document import Document as MPDocument 9 | from megaparse_sdk.schema.extensions import FileExtension 10 | 11 | from megaparse.parser import BaseParser 12 | 13 | 14 | class LlamaParser(BaseParser): 15 | supported_extensions = [FileExtension.PDF] 16 | 17 | def __init__( 18 | self, 19 | api_key: str, 20 | verbose=True, 21 | language: Language = Language.FRENCH, 22 | parsing_instruction: str | None = None, 23 | **kwargs, 24 | ) -> None: 25 | self.api_key = api_key 26 | self.verbose = verbose 27 | self.language = language 28 | if parsing_instruction: 29 | self.parsing_instruction = parsing_instruction 30 | else: 31 | self.parsing_instruction = """Do not take into account the page breaks (no --- between pages), 32 | do not repeat the header and the footer so the tables are merged if needed. Keep the same format for similar tables.""" 33 | 34 | async def aconvert( 35 | self, 36 | file_path: str | Path | None = None, 37 | file: IO[bytes] | None = None, 38 | file_extension: None | FileExtension = None, 39 | **kwargs, 40 | ) -> MPDocument: 41 | if not file_path: 42 | raise ValueError("File_path should be provided to run LlamaParser") 43 | self.check_supported_extension(file_extension, file_path) 44 | 45 | llama_parser = _LlamaParse( 46 | api_key=self.api_key, 47 | result_type=ResultType.MD, 48 | gpt4o_mode=True, 49 | verbose=self.verbose, 50 | language=self.language, 51 | parsing_instruction=self.parsing_instruction, 52 | ) 53 | 54 | documents: List[LlamaDocument] = await llama_parser.aload_data(str(file_path)) 55 | 56 | return self.__to_elements_list__(documents) 57 | 58 | def convert( 59 | self, 60 | file_path: str | Path | None = None, 61 | file: IO[bytes] | None = None, 62 | file_extension: None | FileExtension = None, 63 | **kwargs, 64 | ) -> MPDocument: 65 | if not file_path: 66 | raise ValueError("File_path should be provided to run LlamaParser") 67 | self.check_supported_extension(file_extension, file_path) 68 | 69 | 
llama_parser = _LlamaParse( 70 | api_key=self.api_key, 71 | result_type=ResultType.JSON, 72 | gpt4o_mode=True, 73 | verbose=self.verbose, 74 | language=self.language, 75 | parsing_instruction=self.parsing_instruction, 76 | ) 77 | 78 | documents: List[LlamaDocument] = llama_parser.load_data(str(file_path)) 79 | 80 | return self.__to_elements_list__(documents) 81 | 82 | def __to_elements_list__(self, llama_doc: List[LlamaDocument]) -> MPDocument: 83 | list_blocks = [] 84 | for i, page in enumerate(llama_doc): 85 | list_blocks.append( 86 | TextBlock( 87 | text=page.text, 88 | metadata={}, 89 | page_range=(i, i + 1), 90 | bbox=BBOX( 91 | top_left=Point2D(x=0, y=0), bottom_right=Point2D(x=1, y=1) 92 | ), 93 | ) 94 | ) 95 | return MPDocument( 96 | metadata={}, 97 | detection_origin="llamaparse", 98 | content=list_blocks, 99 | ) 100 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/parser/megaparse_vision.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import base64 3 | import re 4 | from io import BytesIO 5 | from pathlib import Path 6 | from typing import IO, List 7 | 8 | from langchain_core.language_models.chat_models import BaseChatModel 9 | from langchain_core.messages import HumanMessage 10 | from megaparse_sdk.schema.document import BBOX, Block, Point2D, TextBlock 11 | from megaparse_sdk.schema.document import Document as MPDocument 12 | from megaparse_sdk.schema.extensions import FileExtension 13 | from pdf2image import convert_from_path 14 | 15 | from megaparse.parser import BaseParser 16 | from megaparse.parser.entity import SupportedModel, TagEnum 17 | 18 | # BASE_OCR_PROMPT = """ 19 | # Transcribe the content of this file into markdown. Be mindful of the formatting. 20 | # Add formatting if you think it is not clear. 21 | # Do not include page breaks and merge content of tables if it is continued in the next page. 
22 | # Add tags around what you identify as a table [TABLE], header - complete chain of characters that are repeated at each start of pages - [HEADER], table of content [TOC] in the format '[tag] ... [/tag]' 23 | # Return only the parsed content. 24 | # """ 25 | 26 | BASE_OCR_PROMPT = """ 27 | You are tasked with transcribing and formatting the content of a file into markdown. Your goal is to create a well-structured, readable markdown document that accurately represents the original content while adding appropriate formatting and tags. 28 | 29 | 30 | Follow these instructions to complete the task: 31 | 32 | 1. Carefully read through the entire file content. 33 | 34 | 2. Transcribe the content into markdown format, paying close attention to the existing formatting and structure. 35 | 36 | 3. If you encounter any unclear formatting in the original content, use your judgment to add appropriate markdown formatting to improve readability and structure. 37 | 38 | 4. For tables, headers, and table of contents, add the following tags: 39 | - Tables: Enclose the entire table in [TABLE] and [/TABLE] tags. Merge content of tables if it is continued in the next page. 40 | - Headers (complete chain of characters repeated at the start of each page): Enclose in [HEADER] and [/HEADER] tags inside the markdown file. 41 | - Table of contents: Enclose in [TOC] and [/TOC] tags 42 | 43 | 5. When transcribing tables: 44 | - If a table continues across multiple pages, merge the content into a single, cohesive table. 45 | - Use proper markdown table formatting with pipes (|) and hyphens (-) for table structure. 46 | 47 | 6. Do not include page breaks in your transcription. 48 | 49 | 7. Maintain the logical flow and structure of the document, ensuring that sections and subsections are properly formatted using markdown headers (# for main headers, ## for subheaders, etc.). 50 | 51 | 8. 
Use appropriate markdown syntax for other formatting elements such as bold, italic, lists, and code blocks as needed. 52 | 53 | 10. Return only the parsed content in markdown format, including the specified tags for tables, headers, and table of contents. 54 | """ 55 | 56 | 57 | class MegaParseVision(BaseParser): 58 | supported_extensions = [FileExtension.PDF] 59 | 60 | def __init__(self, model: BaseChatModel, **kwargs): 61 | if hasattr(model, "model_name"): 62 | if not SupportedModel.is_supported(model.model_name): 63 | raise ValueError( 64 | f"Invald model name, MegaParse vision only supports model that have vision capabilities. " 65 | f"{model.model_name} is not supported." 66 | ) 67 | self.model = model 68 | 69 | self.parsed_chunks: list[str] | None = None 70 | 71 | def process_file(self, file_path: str, image_format: str = "PNG") -> List[str]: 72 | """ 73 | Process a PDF file and convert its pages to base64 encoded images. 74 | 75 | :param file_path: Path to the PDF file 76 | :param image_format: Format to save the images (default: PNG) 77 | :return: List of base64 encoded images 78 | """ 79 | try: 80 | images = convert_from_path(file_path) 81 | images_base64 = [] 82 | for image in images: 83 | buffered = BytesIO() 84 | image.save(buffered, format=image_format) 85 | image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") 86 | images_base64.append(image_base64) 87 | return images_base64 88 | except Exception as e: 89 | raise ValueError(f"Error processing PDF file: {str(e)}") 90 | 91 | def get_element(self, tag: TagEnum, chunk: str): 92 | pattern = rf"\[{tag.value}\]([\s\S]*?)\[/{tag.value}\]" 93 | all_elmts = re.findall(pattern, chunk) 94 | if not all_elmts: 95 | print(f"No {tag.value} found in the chunk") 96 | return [] 97 | return [elmt.strip() for elmt in all_elmts] 98 | 99 | async def asend_to_mlm(self, images_data: List[str]) -> str: 100 | """ 101 | Send images to the language model for processing. 
102 | 103 | :param images_data: List of base64 encoded images 104 | :return: Processed content as a string 105 | """ 106 | images_prompt = [ 107 | { 108 | "type": "image_url", 109 | "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}, 110 | } 111 | for image_data in images_data 112 | ] 113 | message = HumanMessage( 114 | content=[ 115 | {"type": "text", "text": BASE_OCR_PROMPT}, 116 | *images_prompt, 117 | ], 118 | ) 119 | response = await self.model.ainvoke([message]) 120 | return str(response.content) 121 | 122 | def send_to_mlm(self, images_data: List[str]) -> str: 123 | """ 124 | Send images to the language model for processing. 125 | 126 | :param images_data: List of base64 encoded images 127 | :return: Processed content as a string 128 | """ 129 | images_prompt = [ 130 | { 131 | "type": "image_url", 132 | "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}, 133 | } 134 | for image_data in images_data 135 | ] 136 | message = HumanMessage( 137 | content=[ 138 | {"type": "text", "text": BASE_OCR_PROMPT}, 139 | *images_prompt, 140 | ], 141 | ) 142 | response = self.model.invoke([message]) 143 | return str(response.content) 144 | 145 | async def aconvert( 146 | self, 147 | file_path: str | Path | None = None, 148 | file: IO[bytes] | None = None, 149 | file_extension: FileExtension | None = None, 150 | batch_size: int = 3, 151 | **kwargs, 152 | ) -> MPDocument: 153 | """ 154 | Parse a PDF file and process its content using the language model. 
155 | 156 | :param file_path: Path to the PDF file 157 | :param batch_size: Number of pages to process concurrently 158 | :return: List of processed content strings 159 | """ 160 | if not file_path: 161 | raise ValueError("File_path should be provided to run MegaParseVision") 162 | 163 | if isinstance(file_path, Path): 164 | file_path = str(file_path) 165 | 166 | self.check_supported_extension(file_extension, file_path) 167 | 168 | pdf_base64 = self.process_file(file_path) 169 | n_pages = len(pdf_base64) 170 | tasks = [ 171 | self.asend_to_mlm(pdf_base64[i : i + batch_size]) 172 | for i in range(0, len(pdf_base64), batch_size) 173 | ] 174 | self.parsed_chunks = await asyncio.gather(*tasks) 175 | responses = self.get_cleaned_content("\n".join(self.parsed_chunks)) 176 | return self.__to_elements_list__(responses, n_pages=n_pages) 177 | 178 | def convert( 179 | self, 180 | file_path: str | Path | None = None, 181 | file: IO[bytes] | None = None, 182 | file_extension: FileExtension | None = None, 183 | batch_size: int = 3, 184 | **kwargs, 185 | ) -> MPDocument: 186 | """ 187 | Parse a PDF file and process its content using the language model. 
188 | 189 | :param file_path: Path to the PDF file 190 | :param batch_size: Number of pages to process at a time 191 | :return: List of processed content strings 192 | """ 193 | if not file_path: 194 | raise ValueError("File_path should be provided to run MegaParseVision") 195 | 196 | if isinstance(file_path, Path): 197 | file_path = str(file_path) 198 | 199 | self.check_supported_extension(file_extension, file_path) 200 | 201 | pdf_base64 = self.process_file(file_path) 202 | n_pages = len(pdf_base64) 203 | chunks = [ 204 | pdf_base64[i : i + batch_size] 205 | for i in range(0, len(pdf_base64), batch_size) 206 | ] 207 | self.parsed_chunks = [] 208 | for chunk in chunks: 209 | response = self.send_to_mlm(chunk) 210 | self.parsed_chunks.append(response) 211 | responses = self.get_cleaned_content("\n".join(self.parsed_chunks)) 212 | return self.__to_elements_list__(responses, n_pages) 213 | 214 | def get_cleaned_content(self, parsed_file: str) -> str: 215 | """ 216 | Get cleaned parsed file without any tags defined in TagEnum. 217 | 218 | This method removes all tags from TagEnum from the parsed file, formats the content, 219 | and handles the HEADER tag specially by keeping only the first occurrence. 220 | 221 | Args: 222 | parsed_file (str): The parsed file content with tags. 223 | 224 | Returns: 225 | str: The cleaned content without TagEnum tags. 
    def __to_elements_list__(self, mpv_doc: str, n_pages: int) -> MPDocument:
        """
        Wrap the cleaned markdown output in a single-block MPDocument.

        The whole parsed text becomes one TextBlock spanning pages
        ``(0, n_pages - 1)`` with a full-page bounding box, since the vision
        pipeline does not preserve per-page boundaries.

        Args:
            mpv_doc: Cleaned markdown content for the whole document.
            n_pages: Number of pages in the source PDF.

        Returns:
            MPDocument tagged with detection_origin "megaparse_vision".
        """
        list_blocks: List[Block] = [
            TextBlock(
                text=mpv_doc,
                metadata={},
                page_range=(0, n_pages - 1),
                bbox=BBOX(top_left=Point2D(x=0, y=0), bottom_right=Point2D(x=1, y=1)),
            )
        ]
        return MPDocument(
            metadata={},
            detection_origin="megaparse_vision",
            content=list_blocks,
        )
def extract_layout(
    page_number: int,
    page_image: Image.Image,
    model_name: str = "yolox",
    display: bool = True,
) -> PageLayout:
    """Run layout detection on a single rendered page image.

    Args:
        page_number: Page index forwarded to the resulting ``PageLayout``.
        page_image: The page rendered as a PIL image.
        model_name: Name of the unstructured-inference detection model to load.
        display: When True (default, preserving historical behaviour), draw
            every detected element's bounding box in red and open the result
            in the system image viewer. Pass False for headless/library use —
            the unconditional viewer pop-up was a debug side effect.

    Returns:
        PageLayout: The detected layout elements for the page.
    """
    layout_model = get_model(model_name)
    parsed_page = PageLayout.from_image(
        image=page_image,
        number=page_number,
        detection_model=layout_model,
        element_extraction_model=None,
        fixed_layout=None,
    )

    if display:
        # Debug visualisation: overlay each detected element in red.
        for element in parsed_page.elements:
            page_image = draw_bbox(page_image, element, color="red", details=False)
        # NOTE: blocking/side-effectful — launches the OS image viewer.
        page_image.show()

    return parsed_page
["CoreMLExecutionProvider"] 25 | elif device == DeviceEnum.CPU: 26 | return ["CPUExecutionProvider"] 27 | else: 28 | raise ValueError("device not in (CUDA,CoreML,CPU)") 29 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/utils/strategy.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy as np 4 | from megaparse.models.page import Page 5 | from megaparse_sdk.schema.document import TextDetection 6 | from megaparse_sdk.schema.parser_config import StrategyEnum 7 | from pypdfium2._helpers.page import PdfPage 8 | 9 | 10 | def get_page_strategy( 11 | pdfium_page: PdfPage, onnxtr_page: TextDetection | None, threshold: float 12 | ) -> StrategyEnum: 13 | if onnxtr_page is None: 14 | return StrategyEnum.FAST 15 | text_coords = [] 16 | # Get all the images in the page 17 | for obj in pdfium_page.get_objects(): 18 | if obj.type == 1: # type: ignore 19 | text_coords.append(obj.get_pos()) 20 | 21 | p_width, p_height = int(pdfium_page.get_width()), int(pdfium_page.get_height()) 22 | 23 | pdfium_canva = np.zeros((int(p_height), int(p_width))) 24 | 25 | for coords in text_coords: 26 | # (left,bottom,right, top) 27 | # 0---l--------------R-> y 28 | # | 29 | # B (x0,y0) 30 | # | 31 | # T (x1,y1) 32 | # ^ 33 | # x 34 | x0, y0, x1, y1 = ( 35 | p_height - coords[3], 36 | coords[0], 37 | p_height - coords[1], 38 | coords[2], 39 | ) 40 | x0 = max(0, min(p_height, int(x0))) 41 | y0 = max(0, min(p_width, int(y0))) 42 | x1 = max(0, min(p_height, int(x1))) 43 | y1 = max(0, min(p_width, int(y1))) 44 | pdfium_canva[x0:x1, y0:y1] = 1 45 | 46 | onnxtr_canva = np.zeros((int(p_height), int(p_width))) 47 | for block in onnxtr_page.bboxes: 48 | x0, y0 = block.bbox[0] 49 | x1, y1 = block.bbox[1] 50 | x0 = max(0, min(int(x0 * p_width), int(p_width))) 51 | y0 = max(0, min(int(y0 * p_height), int(p_height))) 52 | x1 = max(0, min(int(x1 * p_width), int(p_width))) 
53 | y1 = max(0, min(int(y1 * p_height), int(p_height))) 54 | onnxtr_canva[y0:y1, x0:x1] = 1 55 | 56 | intersection = np.logical_and(pdfium_canva, onnxtr_canva) 57 | union = np.logical_or(pdfium_canva, onnxtr_canva) 58 | sum_intersection = np.sum(intersection) 59 | sum_union = np.sum(union) 60 | iou = sum_intersection / sum_union if sum_union != 0 else 0 61 | if iou < threshold: 62 | return StrategyEnum.HI_RES 63 | return StrategyEnum.FAST 64 | 65 | 66 | def determine_global_strategy(pages: List[Page], threshold: float) -> StrategyEnum: 67 | count = sum(1 for page in pages if page.strategy == StrategyEnum.HI_RES) 68 | if count / len(pages) > threshold: 69 | return StrategyEnum.HI_RES 70 | return StrategyEnum.FAST 71 | -------------------------------------------------------------------------------- /libs/megaparse/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/__init__.py -------------------------------------------------------------------------------- /libs/megaparse/tests/certs/client-cert.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIEqDCCAxCgAwIBAgIRAITvq6ZEk6paYFDRbueJhEMwDQYJKoZIhvcNAQELBQAw 3 | gZ0xHjAcBgNVBAoTFW1rY2VydCBkZXZlbG9wbWVudCBDQTE5MDcGA1UECwwwYW1p 4 | bmVAYW1pbmVzLU1hY0Jvb2stUHJvLmxvY2FsIChhbWluZSBkaXJob3Vzc2kpMUAw 5 | PgYDVQQDDDdta2NlcnQgYW1pbmVAYW1pbmVzLU1hY0Jvb2stUHJvLmxvY2FsIChh 6 | bWluZSBkaXJob3Vzc2kpMB4XDTI0MTExOTEwNDgwN1oXDTI3MDIxOTEwNDgwN1ow 7 | ZDEnMCUGA1UEChMebWtjZXJ0IGRldmVsb3BtZW50IGNlcnRpZmljYXRlMTkwNwYD 8 | VQQLDDBhbWluZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFtaW5lIGRpcmhv 9 | dXNzaSkwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQC2fDlGlKYIj8bp 10 | tlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5 11 | KDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH 12 | 
qmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN 13 | gLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8 14 | ghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT 15 | WWVVcNfJAgMBAAGjgZowgZcwDgYDVR0PAQH/BAQDAgWgMCcGA1UdJQQgMB4GCCsG 16 | AQUFBwMCBggrBgEFBQcDAQYIKwYBBQUHAwQwHwYDVR0jBBgwFoAUV2w3gvQM5La1 17 | 2fk80tJXoM/14l4wOwYDVR0RBDQwMoIJbG9jYWxob3N0gRNtZWdhcGFyc2VAcXVp 18 | dnIuYXBwhxAAAAAAAAAAAAAAAAAAAAABMA0GCSqGSIb3DQEBCwUAA4IBgQAYq4VZ 19 | 6spwGvcqg8kCOghu6o54UPYo/NLzh3oYewJnDJ+2XD786TpTgjZMGA6Ms+det6oV 20 | HdT5s77VFgJiJloHlD0fpKkRxjzyBOk5/bQcCKkTMBVfgJbMoAfa2gq+/7zxmLcn 21 | AmNg7BkmsTtHWPsLyN3rYI4dkkDKWkxp8Sezm9WPEa9OGJDJSYf4Dq9pN1lUoP1p 22 | vxsq7sW0HDWnx/I2zWuz3AaT9b4UayRnk4IRYxAuYYN/k0GNjVmmDveywNoNlkmW 23 | 0Az6ycPN+vvz8Jpm3CbZSIQLO8Yn57H/aU4DmOtunm3VLUiLucmfOggv8Sq5n2g9 24 | ze61UJu9lr2/nWOXnErl3V9UL3kJ1OlbFzTWDGm9zX7boo6MLXy+fAj+Tw0sCeMr 25 | drdxo8IUYYU6HUdtuLGMFznBFFUNhfFSwFANGPB38NyofwLPSZM0hYntQqBMt/P7 26 | /E+wQ67hSEutkIbOD3kGkGREIk3dVyUeajO9DFTaQ+yTnNtnuUbxs5LkRlw= 27 | -----END CERTIFICATE----- 28 | -------------------------------------------------------------------------------- /libs/megaparse/tests/certs/client-key.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN PRIVATE KEY----- 2 | MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQC2fDlGlKYIj8bp 3 | tlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5 4 | KDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH 5 | qmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN 6 | gLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8 7 | ghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT 8 | WWVVcNfJAgMBAAECggEBAIK2AlSzHyacze8UH16qDTzibGVRGjxkf895Rnqi6COU 9 | QYD3PQrsVYCS/sMbHiujHV7FZC+rRcmufaBTVl7bH10yGIQc28iZ2YtbsppTEkTj 10 | rGUynTtXJPNHZ2vJOs1I9LXdk7maogPN2zzraIQP7AgTGCSOclIi3fpfRmfKwUOj 11 | 
class FakeParserBuilder:
    """Test double for the API's parser-builder dependency.

    ``build`` ignores its arguments and returns a stub parser whose sync and
    async conversion methods print a marker line and return a canned
    single-block document, so endpoint tests never run real parsing.
    """

    def build(self, *args, **kwargs) -> BaseParser:
        """
        Build a fake parser, ignoring the given configuration.

        Returns:
            BaseParser: The built fake parser.
        """

        class FakeParser(BaseParser):
            # Synchronous conversion: ignores every input, returns a fixed document.
            def convert(
                self,
                file_path: str | Path | None = None,
                file: IO[bytes] | None = None,
                file_extension: None | FileExtension = None,
                **kwargs,
            ) -> MPDocument:
                print("Fake parser is converting the file")
                return MPDocument(
                    file_name="Fake file",
                    content=[TextBlock(text="Fake conversion result", metadata={})],
                    metadata={},
                    detection_origin="fakeparser",
                )

            # Async variant: same canned result as ``convert``.
            async def aconvert(
                self,
                file_path: str | Path | None = None,
                file: IO[bytes] | None = None,
                file_extension: None | FileExtension = None,
                **kwargs,
            ) -> MPDocument:
                print("Fake parser is converting the file")
                return MPDocument(
                    file_name="Fake file",
                    content=[TextBlock(text="Fake conversion result", metadata={})],
                    metadata={},
                    detection_origin="fakeparser",
                )

        return FakeParser()


@pytest_asyncio.fixture(scope="function")
async def test_client():
    """Yield an httpx ``AsyncClient`` bound to the app with faked dependencies.

    Overrides the parser builder and the Playwright URL loader so no real
    parsing or browser work happens, then clears the overrides on teardown.
    """
    print("Setting up test_client fixture")

    def fake_parser_builder():
        return FakeParserBuilder()

    def fake_playwright_loader():
        # Loader stub: returns fixed page content instead of driving a browser.
        class FakePlaywrightLoader(PlaywrightURLLoader):
            async def aload(self):
                return [Document(page_content="Fake website content")]

        return FakePlaywrightLoader(urls=[], remove_selectors=["header", "footer"])

    app.dependency_overrides[parser_builder_dep] = fake_parser_builder
    app.dependency_overrides[get_playwright_loader] = fake_playwright_loader
    # ASGITransport routes requests to the app in-process — no network involved.
    async with AsyncClient(
        transport=ASGITransport(app=app),  # type: ignore
        base_url="http://test",
    ) as ac:
        yield ac
    # Teardown: restore the real dependencies for subsequent tests.
    app.dependency_overrides = {}
-------------------------------------------------------------------------------- /libs/megaparse/tests/data/MegaFake_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/data/MegaFake_report.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/data/dummy.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/data/dummy.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/data/grt_example/MegaFake_report.md: -------------------------------------------------------------------------------- 1 | | My Mega fake report | #1756394 | 31/05/2024 | 2 | |---------------------|----------|------------| 3 | 4 | ## Why Mega Parse might be the best ? 5 | 6 | ### Introduction 7 | 8 | Mega Parse is a state-of-the-art document parser designed to convert various document formats such as PDF, DOCX, PPTX, and more into Markdown (MD) format, making them ready for Retrieval-Augmented Generation (RAG) ingestion. In today's data-driven world, the ability to efficiently manage and utilize large volumes of information is crucial. This report explores the features, benefits, and comparative performance of Mega Parse, illustrating why it stands out as a superior tool in the realm of document parsing. 9 | 10 | ### Features of Mega Parse 11 | 12 | Mega Parse boasts an impressive array of features tailored to meet the diverse needs of modern enterprises. 13 | 14 | **Multiple Format Support:** Mega Parse supports a wide range of document formats including PDF, DOCX, and PPTX. This versatility allows users to handle various document types without needing multiple tools. 
Whether you are working with text documents, presentations, or scanned PDFs, Mega Parse has you covered. 15 | 16 | **High-Speed Processing:** One of the standout features of Mega Parse is its ability to convert documents at a rapid pace. With processing speeds of up to 120 pages per minute, it significantly enhances productivity by reducing the time spent on document conversion. 17 | 18 | **Markdown Output:** Mega Parse converts documents into a structured Markdown format. Markdown is a lightweight markup language with plain text formatting syntax, which is widely used because of its simplicity and ease of conversion to other formats. This makes it ideal for RAG ingestion, where structured and easily interpretable data is paramount. 19 | 20 | Accuracy: Accuracy in text extraction and formatting is a critical aspect of any document parser. Mega Parse ensures high accuracy, maintaining the integrity and structure of the original documents. This is particularly important for documents that contain complex formatting and embedded elements. 21 | 22 | Customizable Parsing Rules: Users can define custom parsing rules to suit specific needs, allowing for greater control over the conversion process. This flexibility ensures that Mega Parse can be adapted to a wide variety of use cases. 23 | 24 | Batch Processing: Mega Parse supports batch processing, enabling the simultaneous conversion of multiple documents. This feature is particularly useful for organizations dealing with large volumes of documents, as it streamlines the workflow and saves time. 25 | 26 | Error Handling: Advanced error handling capabilities ensure that any issues encountered during the conversion process are managed effectively, minimizing disruptions and maintaining workflow efficiency. 27 | 28 | # Benefits of Mega Parse 29 | 30 | The implementation of Mega Parse offers numerous benefits that can transform the way organizations manage their documents. 
31 | 32 | **Efficiency:** By significantly speeding up the document conversion process, Mega Parse increases overall efficiency. This is especially beneficial for industries that handle large volumes of documents on a daily basis, such as legal firms, financial institutions, and research organizations. 33 | 34 | **Versatility:** Mega Parse's ability to handle multiple document types makes it a versatile tool for various industries. Whether you need to convert legal documents, technical manuals, or business presentations, Mega Parse is equipped to handle the task. 35 | 36 | **Enhanced Knowledge Management:** Converting documents to Markdown facilitates easier content management and retrieval. Markdown files are not only lightweight but 37 | also highly compatible with various knowledge management systems, making it easier to organize, search, and utilize information. 38 | 39 | Improved Workflow: Mega Parse simplifies the process of preparing documents for machine learning and AI applications. By converting documents into a structured format, it reduces the time and effort required to preprocess data, allowing teams to focus on higher-level tasks. 40 | 41 | Cost Savings: The efficiency and speed of Mega Parse can lead to significant cost savings. Reduced processing times and improved workflow efficiency mean that resources can be allocated more effectively, ultimately lowering operational costs. 42 | 43 | Scalability: Mega Parse is designed to scale with the needs of an organization. As document volumes grow, Mega Parse can handle the increased load without compromising performance, making it a future-proof solution for document management. 44 | 45 | # Comparative Performance 46 | 47 | The following table provides a comprehensive comparative analysis of Mega Parse against other document parsers based on fictional performance metrics. This comparison highlights the strengths of Mega Parse in various key areas. 
48 | 49 | | Metric | Mega Parse | Parser A | Parser B | Parser C | Parser D | 50 | |---------------------|------------------|----------------|----------------|----------------|----------------| 51 | | Supported Formats | PDF, DOCX, PPTX | PDF, DOCX | DOCX, PPTX | PDF, PPTX | PDF, DOCX, XLSX| 52 | | Conversion Speed (pages/min) | 120 | 90 | 100 | 85 | 95 | 53 | | **Accuracy Rate (%)** | 98 | 95 | 93 | 90 | 92 | 54 | | **Output Format** | Markdown | HTML | Markdown | Plain Text | HTML | 55 | | **Error Rate (%)** | 1 | 3 | 4 | 5 | 3 | 56 | | **Ease of Use** | High | Medium | High | Medium | Medium | 57 | | **Integration Capability** | Excellent | Good | Good | Fair | Good | 58 | | **Batch Processing** | Yes | No | Yes | No | Yes | 59 | | **Custom Parsing Rules** | Yes | Limited | Yes | No | Limited | 60 | | **Multilingual Support** | Yes | Yes | No | Yes | Yes | 61 | | **OCR (Optical Character Recognition)** | Yes | No | Yes | No | Yes | 62 | | **Price (per user/month)** | $30 | $25 | $20 | $15 | $18 | 63 | | **Customer Support Rating (out of 5)** | 4.8 | 4.2 | 4.5 | 3.9 | 4.1 | 64 | | **Free Trial Available** | Yes | Yes | No | Yes | No | 65 | | **Cloud Integration** | Yes | No | Yes | Yes | No | 66 | | **Security Features** | Advanced | Basic | Advanced | Basic | Intermediate | 67 | | **User Community Size** | Large | Medium | Medium | Small | Medium | 68 | | **Monthly Updates** | Yes | Yes | No | Yes | No | 69 | | **Mobile App Availability** | Yes | No | Yes | No | Yes | 70 | | **Platform Compatibility** | Windows, Mac, Linux | Windows, Mac | Windows | Mac, Linux | Windows, Linux | 71 | | **Data Privacy Compliance** | High | Medium | High | Low | Medium | 72 | | **AI-Driven Enhancements** | Yes | No | Yes | No | Yes | 73 | | **File Size Limit (per document)** | 1GB | 500MB | 750MB | 200MB | 500MB | 74 | | **User Training Resources** | Extensive | Moderate | Extensive | Limited | Moderate | 75 | | **API Access** | Yes | No | Yes | No | Yes | 76 | | **Customizable 
Output Templates** | Yes | Limited | Yes | No | Yes | 77 | | **Collaboration Features** | Yes | No | Yes | No | Limited | 78 | | **Document Version Control** | Yes | No | Yes | No | Yes | 79 | | **Import/Export Options** | Extensive | Moderate | Extensive | Limited | Moderate | 80 | | Feedback Mechanism | Yes | No | Yes | No | Yes | 81 | 82 | *Note: All data presented in this table is fictional and for illustrative purposes only.* 83 | 84 | ## Conclusion 85 | 86 | Mega Parse stands out as a leading document parser due to its extensive format support, high-speed processing, and accuracy. Its ability to convert a variety of document types into Markdown format makes it an invaluable tool for organizations looking to streamline their document management processes and enhance their knowledge management systems. With features like customizable parsing rules, batch processing, and advanced error handling, Mega Parse is well-equipped to meet the demands of modern enterprises. Its scalability and cost-effectiveness further reinforce its position as a top choice for document parsing and conversion needs. By leveraging Mega Parse, organizations can improve their workflow efficiency, reduce operational costs, and better manage their information assets in the age of big data and artificial intelligence. 
-------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/mlbook.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/mlbook.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/native/0168011.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/native/0168011.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/native/0168014.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/native/0168014.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/native/0168029.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/native/0168029.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/ocr/0168003.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/ocr/0168003.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/ocr/0168004.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/ocr/0168004.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/ocr/0168119.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/ocr/0168119.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/ocr/0168120.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/ocr/0168120.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/ocr/0168123.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/ocr/0168123.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/ocr/0168126.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/ocr/0168126.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/ocr/0168127.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/ocr/0168127.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/ocr/0168322.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/ocr/0168322.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/rust.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/rust.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/sample_native.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/sample_native.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/sample_pdf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/sample_pdf.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/sample_table.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/sample_table.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/test_detect_ocr.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pypdfium2 4 | import pytest 5 | from megaparse.megaparse import MegaParse 6 | from megaparse.utils.strategy import determine_global_strategy 7 | from megaparse_sdk.schema.parser_config import 
# Test corpora: scanned PDFs that should need OCR, and natively digital PDFs.
# Paths are relative to the package root, so run the suite from libs/megaparse.
ocr_pdfs = os.listdir("./tests/pdf/ocr")
native_pdfs = os.listdir("./tests/pdf/native")

# Single shared instance, created once at import time and reused by every
# parametrized case below.
megaparse = MegaParse()


@pytest.mark.parametrize("hi_res_pdf", ocr_pdfs)
def test_hi_res_strategy(hi_res_pdf):
    """Every scanned/OCR PDF in the corpus should resolve to HI_RES."""
    if hi_res_pdf == "0168004.pdf":
        pytest.skip("Skip 0168004.pdf as it is flaky currently")

    pdf_doc = pypdfium2.PdfDocument(f"./tests/pdf/ocr/{hi_res_pdf}")
    pages = megaparse.extract_page_strategies(pdf_doc)
    assert (
        determine_global_strategy(
            pages, megaparse.config.auto_config.document_threshold
        )
        == StrategyEnum.HI_RES
    )


@pytest.mark.parametrize("native_pdf", native_pdfs)
def test_fast_strategy(native_pdf):
    """Every natively digital PDF in the corpus should resolve to FAST."""
    if native_pdf == "0168029.pdf":
        pytest.skip("Skip 0168029.pdf as it is too long to process")

    pdf_doc = pypdfium2.PdfDocument(f"./tests/pdf/native/{native_pdf}")
    pages = megaparse.extract_page_strategies(pdf_doc)

    assert (
        determine_global_strategy(
            pages, megaparse.config.auto_config.document_threshold
        )
        == StrategyEnum.FAST
    )
# def test_get_default_processors_megaparse():
#     megaparse = MegaParse()
#     assert type(megaparse.parser) is UnstructuredParser


@pytest.mark.asyncio
@pytest.mark.parametrize("pdf_name", ["scanned_pdf", "native_pdf"])
async def test_async_megaparse_pdf_processor_file_path(pdf_name, request):
    """Async load by file path produces non-empty output for both PDF kinds."""
    pdf = request.getfixturevalue(pdf_name)
    # NOTE(review): CoreML is macOS-only — confirm this test is intended to
    # run exclusively on Apple hardware; it will fail on Linux CI otherwise.
    processor = MegaParse(config=MegaParseConfig(device=DeviceEnum.COREML))
    result = await processor.aload(file_path=pdf)
    assert len(str(result)) > 0


@pytest.mark.parametrize("pdf_name", ["scanned_pdf", "native_pdf"])
def test_sync_megaparse_pdf_processor_file_path(pdf_name, request):
    """Sync load by file path produces non-empty output for both PDF kinds."""
    pdf = request.getfixturevalue(pdf_name)
    processor = MegaParse()
    result = processor.load(file_path=pdf)
    assert len(result) > 0


@pytest.mark.asyncio
@pytest.mark.parametrize("pdf_name", ["scanned_pdf", "native_pdf"])
async def test_megaparse_pdf_processor_file(pdf_name, request):
    """Async load from an open binary file object (extension given explicitly)."""
    pdf = request.getfixturevalue(pdf_name)
    processor = MegaParse()
    with open(pdf, "rb") as f:
        result = await processor.aload(file=f, file_extension=FileExtension.PDF)
    assert len(str(result)) > 0


def test_strategy_native(native_pdf):
    """A digital-text PDF should be classified FAST by the strategy pass."""
    processor = MegaParse()
    pdf_doc = pypdfium2.PdfDocument(native_pdf)

    pages = processor.extract_page_strategies(pdf_doc)

    assert (
        determine_global_strategy(
            pages, processor.config.auto_config.document_threshold
        )
        == StrategyEnum.FAST
    )
    pdf_doc.close()


def test_strategy_scanned(scanned_pdf):
    """A scanned PDF should be classified HI_RES by the strategy pass."""
    processor = MegaParse()
    pdf_doc = pypdfium2.PdfDocument(scanned_pdf)
    pages = processor.extract_page_strategies(pdf_doc)
    assert (
        determine_global_strategy(
            pages, processor.config.auto_config.document_threshold
        )
        == StrategyEnum.HI_RES
    )
    pdf_doc.close()
def test_pdfium():
    """Smoke test: pypdfium2 opens a scanned PDF and can walk every page object."""
    # scanned pdf
    pdf_path = Path("./tests/pdf/mlbook.pdf")
    document = pdfium.PdfDocument(pdf_path)

    collected = [obj for page in document for obj in page.get_objects()]

    document.close()
https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/supported_docs/file_example_XLS_50.xls -------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample.csv: -------------------------------------------------------------------------------- 1 | Name,Description 2 | MegaParse,"MegaParse is the best parser, even with accents like é, è, and ñ." 3 | OtherParse,"OtherParse is a decent parser, but it struggles with accents." 4 | RandomParse,"RandomParse is another parser, but it often fails with special characters." -------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/supported_docs/sample.docx -------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample.markdown: -------------------------------------------------------------------------------- 1 | # The Difficulty of Parsing Files 2 | 3 | Parsing files can be a challenging task due to several factors: 4 | 5 | ## 1. File Format Variability 6 | Different file formats (e.g., JSON, XML, CSV) require different parsing techniques. Each format has its own structure and rules, making it necessary to handle each one uniquely. 7 | 8 | ## 2. Inconsistent Data 9 | Files often contain inconsistent or malformed data. Handling these inconsistencies requires robust error-checking and validation mechanisms. 10 | 11 | ## 3. Large File Sizes 12 | Parsing large files can be resource-intensive and time-consuming. Efficient algorithms and memory management techniques are essential to handle large datasets. 13 | 14 | ## 4. 
Encoding Issues 15 | Files may use different character encodings (e.g., UTF-8, ASCII). Properly detecting and handling these encodings is crucial to avoid data corruption. 16 | 17 | ## 5. Nested Structures 18 | Some file formats, like JSON and XML, can have deeply nested structures. Parsing these nested structures requires recursive algorithms and careful handling of hierarchical data. 19 | 20 | ## Conclusion 21 | Despite these challenges, effective file parsing is essential for data processing and analysis. By understanding and addressing these difficulties, developers can create robust parsers that handle a wide variety of file formats and data inconsistencies. 22 | -------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample.md: -------------------------------------------------------------------------------- 1 | # The Difficulty of Parsing Files 2 | 3 | Parsing files can be a challenging task due to several factors: 4 | 5 | ## 1. File Format Variability 6 | Different file formats (e.g., JSON, XML, CSV) require different parsing techniques. Each format has its own structure and rules, making it necessary to handle each one uniquely. 7 | 8 | ## 2. Inconsistent Data 9 | Files often contain inconsistent or malformed data. Handling these inconsistencies requires robust error-checking and validation mechanisms. 10 | 11 | ## 3. Large File Sizes 12 | Parsing large files can be resource-intensive and time-consuming. Efficient algorithms and memory management techniques are essential to handle large datasets. 13 | 14 | ## 4. Encoding Issues 15 | Files may use different character encodings (e.g., UTF-8, ASCII). Properly detecting and handling these encodings is crucial to avoid data corruption. 16 | 17 | ## 5. Nested Structures 18 | Some file formats, like JSON and XML, can have deeply nested structures. Parsing these nested structures requires recursive algorithms and careful handling of hierarchical data. 
19 | 20 | ## Conclusion 21 | Despite these challenges, effective file parsing is essential for data processing and analysis. By understanding and addressing these difficulties, developers can create robust parsers that handle a wide variety of file formats and data inconsistencies. 22 | -------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/supported_docs/sample.otf -------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/supported_docs/sample.pptx -------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample.txt: -------------------------------------------------------------------------------- 1 | Lorem ipsum 2 | 3 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio. 4 | 5 | Vestibulum neque massa, scelerisque sit amet ligula eu, congue molestie mi. Praesent ut varius sem. Nullam at porttitor arcu, nec lacinia nisi. Ut ac dolor vitae odio interdum condimentum. Vivamus dapibus sodales ex, vitae malesuada ipsum cursus convallis. Maecenas sed egestas nulla, ac condimentum orci. Mauris diam felis, vulputate ac suscipit et, iaculis non est. Curabitur semper arcu ac ligula semper, nec luctus nisl blandit. Integer lacinia ante ac libero lobortis imperdiet. Nullam mollis convallis ipsum, ac accumsan nunc vehicula vitae. Nulla eget justo in felis tristique fringilla. Morbi sit amet tortor quis risus auctor condimentum. Morbi in ullamcorper elit. 
Nulla iaculis tellus sit amet mauris tempus fringilla. 6 | Maecenas mauris lectus, lobortis et purus mattis, blandit dictum tellus. 7 | Maecenas non lorem quis tellus placerat varius. 8 | Nulla facilisi. 9 | Aenean congue fringilla justo ut aliquam. 10 | Mauris id ex erat. Nunc vulputate neque vitae justo facilisis, non condimentum ante sagittis. 11 | Morbi viverra semper lorem nec molestie. 12 | Maecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate. 13 | https://github.com/QuivrHQ/MegaParse -------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Charter Group 5 |
6 | 100 Main 7 | Framingham 8 | MA 9 | 01701 10 |
11 |
12 | 720 Prospect 13 | Framingham 14 | MA 15 | 01701 16 |
17 |
18 | 120 Ridge 19 | MA 20 | 01760 21 |
22 |
23 |
-------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample_complexe.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/supported_docs/sample_complexe.html -------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample_native.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/supported_docs/sample_native.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/test_endpoints.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.asyncio 5 | async def test_parse_file_endpoint(test_client): 6 | # Simulate a request to the parse endpoint 7 | with open("./tests/pdf/sample_pdf.pdf", "rb") as file: 8 | response = await test_client.post( 9 | "/v1/file", 10 | files={"file": ("test.pdf", file)}, 11 | data={ 12 | "method": "unstructured", 13 | "strategy": "auto", 14 | "language": "en", 15 | "check_table": False, 16 | }, 17 | ) 18 | assert response.status_code == 200 19 | assert response.json()["message"] == "File parsed successfully" 20 | 21 | 22 | @pytest.mark.asyncio 23 | async def test_parse_url_endpoint(test_client): 24 | response = await test_client.post("/v1/url?url=https://www.quivr.com") 25 | assert response.status_code == 200 26 | assert response.json() == { 27 | "message": "Website content parsed successfully", 28 | "result": "Fake website content", 29 | } 30 | -------------------------------------------------------------------------------- /libs/megaparse/tests/test_import.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | from megaparse import MegaParse 3 | 4 | 5 | @pytest.mark.skip("slow test") 6 | def test_load(): 7 | megaparse = MegaParse() 8 | response = megaparse.load("./tests/data/dummy.pdf") 9 | print(response) 10 | assert response.strip("\n") == "Dummy PDF download" 11 | -------------------------------------------------------------------------------- /libs/megaparse/tests/test_parsers.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | from megaparse.parser.doctr_parser import DoctrParser 5 | from megaparse.parser.llama import LlamaParser 6 | from megaparse.parser.megaparse_vision import MegaParseVision 7 | from megaparse.parser.unstructured_parser import UnstructuredParser 8 | from megaparse_sdk.schema.extensions import FileExtension 9 | 10 | PARSER_LIST = [ 11 | UnstructuredParser, 12 | # DoctrParser, 13 | ] 14 | 15 | 16 | @pytest.mark.parametrize("parser", PARSER_LIST) 17 | @pytest.mark.parametrize("extension", list(FileExtension)) 18 | def test_sync_parser(parser, extension): 19 | directory = "./tests/supported_docs" 20 | file_path = next( 21 | ( 22 | os.path.join(root, file) 23 | for root, _, files in os.walk(directory) 24 | for file in files 25 | if file.endswith(extension.value) 26 | ), 27 | None, 28 | ) 29 | if file_path is None: 30 | pytest.fail(f"No file with extension {extension.value} found in {directory}") 31 | 32 | myparser = parser() 33 | if extension in myparser.supported_extensions: 34 | response = myparser.convert(file_path) 35 | 36 | assert response 37 | assert len(str(response)) > 0 38 | else: 39 | with pytest.raises(ValueError): 40 | myparser.convert(file_path) 41 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | 
## [0.1.12](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.11...megaparse-sdk-v0.1.12) (2025-02-13) 4 | 5 | 6 | ### Features 7 | 8 | * add layout detection ([#228](https://github.com/QuivrHQ/MegaParse/issues/228)) ([77f7040](https://github.com/QuivrHQ/MegaParse/commit/77f7040c9c221a17effce089be7ec575cdd83468)) 9 | 10 | ## [0.1.11](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.10...megaparse-sdk-v0.1.11) (2025-02-11) 11 | 12 | 13 | ### Features 14 | 15 | * add_layout_detection ([#220](https://github.com/QuivrHQ/MegaParse/issues/220)) ([2d2d0b4](https://github.com/QuivrHQ/MegaParse/commit/2d2d0b42bba4c883db423568e932eda42edd60d7)) 16 | * Text detection in auto strategy ([#209](https://github.com/QuivrHQ/MegaParse/issues/209)) ([03c7ada](https://github.com/QuivrHQ/MegaParse/commit/03c7ada1dc245e13ef41ffd6fa3a8ed869269d37)) 17 | 18 | 19 | ### Bug Fixes 20 | 21 | * Add EngineConfig & StrategyHandler ([#211](https://github.com/QuivrHQ/MegaParse/issues/211)) ([2e1c6dd](https://github.com/QuivrHQ/MegaParse/commit/2e1c6ddd676227d1cbc4cff9771b20595259ba38)) 22 | * add parse tests for every supported extensions ([#198](https://github.com/QuivrHQ/MegaParse/issues/198)) ([9dff0de](https://github.com/QuivrHQ/MegaParse/commit/9dff0de0c1de848151fe9a6519b658f0924c1228)) 23 | * Strategy heuristic test & fix ([#203](https://github.com/QuivrHQ/MegaParse/issues/203)) ([7b7fb40](https://github.com/QuivrHQ/MegaParse/commit/7b7fb40cae4ed380a5f0ca0035a7bd2bcc9147c3)) 24 | 25 | ## [0.1.10](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.9...megaparse-sdk-v0.1.10) (2024-12-16) 26 | 27 | 28 | ### Bug Fixes 29 | 30 | * hatchling version ([#193](https://github.com/QuivrHQ/MegaParse/issues/193)) ([f6070a5](https://github.com/QuivrHQ/MegaParse/commit/f6070a5483a20eeb83751a2dcfc01b7f0fb14473)) 31 | 32 | ## [0.1.9](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.8...megaparse-sdk-v0.1.9) (2024-12-13) 33 | 34 | 35 | ### Features 36 | 
37 | * small fixes ([#181](https://github.com/QuivrHQ/MegaParse/issues/181)) ([004afe2](https://github.com/QuivrHQ/MegaParse/commit/004afe2f170570075bbebcd32dec5d15ddba4609)) 38 | 39 | ## [0.1.8](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.7...megaparse-sdk-v0.1.8) (2024-12-12) 40 | 41 | 42 | ### Features 43 | 44 | * custom auto ([#131](https://github.com/QuivrHQ/MegaParse/issues/131)) ([3cb5be4](https://github.com/QuivrHQ/MegaParse/commit/3cb5be4a8c8eeb6dd6e9b87d7bbca24491db4c29)) 45 | * faster ocr ([#180](https://github.com/QuivrHQ/MegaParse/issues/180)) ([5661cb2](https://github.com/QuivrHQ/MegaParse/commit/5661cb2d52d959cbca0f41339791129cd35d4036)) 46 | 47 | ## [0.1.7](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.6...megaparse-sdk-v0.1.7) (2024-11-25) 48 | 49 | 50 | ### Bug Fixes 51 | 52 | * Update README.md ([#154](https://github.com/QuivrHQ/MegaParse/issues/154)) ([a103393](https://github.com/QuivrHQ/MegaParse/commit/a1033938184e20c24b0e54ee0db088b28075fd14)) 53 | 54 | ## [0.1.6](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.5...megaparse-sdk-v0.1.6) (2024-11-25) 55 | 56 | 57 | ### Features 58 | 59 | * megaparse sdk tests ([#148](https://github.com/QuivrHQ/MegaParse/issues/148)) ([e030285](https://github.com/QuivrHQ/MegaParse/commit/e0302853fc2c1526b8e912bf3ef85b970a5b89bc)) 60 | 61 | ## [0.1.5](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.4...megaparse-sdk-v0.1.5) (2024-11-21) 62 | 63 | 64 | ### Features 65 | 66 | * refacto megaparse for service ([#132](https://github.com/QuivrHQ/MegaParse/issues/132)) ([ab9ad7f](https://github.com/QuivrHQ/MegaParse/commit/ab9ad7fb7db580a04a998d144dd2ba3407068334)) 67 | * release plz ([#134](https://github.com/QuivrHQ/MegaParse/issues/134)) ([d8a221e](https://github.com/QuivrHQ/MegaParse/commit/d8a221e23f6e15e969c1328f183da3582d0d7925)) 68 | -------------------------------------------------------------------------------- 
/libs/megaparse_sdk/README.md: -------------------------------------------------------------------------------- 1 | ## MegaParse SDK 2 | 3 | Welcome to the MegaParse SDK! This SDK allows you to easily interact with the MegaParse API to upload URLs and files for processing. 4 | 5 | ### Installation 6 | 7 | To install the MegaParse SDK, use pip: 8 | 9 | ```sh 10 | pip install megaparse-sdk 11 | ``` 12 | 13 | ### Usage 14 | 15 | Here is an example of how to use the MegaParse SDK: 16 | 17 | #### Uploading URLs 18 | 19 | ```python 20 | import asyncio 21 | import os 22 | 23 | from megaparse.sdk import MegaParseSDK 24 | 25 | async def upload_url(): 26 | api_key = str(os.getenv("MEGAPARSE_API_KEY")) 27 | megaparse = MegaParseSDK(api_key) 28 | 29 | url = "https://www.quivr.com" 30 | 31 | # Upload a URL 32 | url_response = await megaparse.url.upload(url) 33 | print(f"\n----- URL Response : {url} -----\n") 34 | print(url_response) 35 | 36 | await megaparse.close() 37 | 38 | if __name__ == "__main__": 39 | asyncio.run(upload_url()) 40 | ``` 41 | 42 | #### Uploading Files 43 | 44 | ```python 45 | import asyncio 46 | import os 47 | 48 | from megaparse.sdk import MegaParseSDK 49 | 50 | async def upload_file(): 51 | api_key = str(os.getenv("MEGAPARSE_API_KEY")) 52 | megaparse = MegaParseSDK(api_key) 53 | 54 | file_path = "your/file/path.pdf" 55 | # Upload a file 56 | response = await megaparse.file.upload( 57 | file_path=file_path, 58 | method="unstructured", # unstructured, llama_parser, megaparse_vision 59 | strategy="auto", 60 | ) 61 | print(f"\n----- File Response : {file_path} -----\n") 62 | print(response) 63 | 64 | await megaparse.close() 65 | 66 | if __name__ == "__main__": 67 | asyncio.run(upload_file()) 68 | ``` 69 | 70 | ### Features 71 | 72 | - **Upload URLs**: Easily upload URLs for processing. 73 | - **Upload Files**: Upload files with different processing methods and strategies. 74 | 75 | ### Getting Started 76 | 77 | 1. 
**Set up your API key**: Make sure to set the `MEGAPARSE_API_KEY` environment variable with your MegaParse API key. 78 | 2. **Run the example**: Use the provided example to see how to upload URLs and files. 79 | 80 | For more details, refer to the [usage example](#file:usage_example.py-context). 81 | 82 | We hope you find the MegaParse SDK useful for your projects! 83 | 84 | Enjoy, _Quivr Team_ ! 85 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/examples/usage_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | 4 | from megaparse.sdk.megaparse_sdk import MegaParseSDK 5 | 6 | 7 | async def main(): 8 | api_key = str(os.getenv("MEGAPARSE_API_KEY")) 9 | megaparse = MegaParseSDK(api_key) 10 | 11 | # url = "https://www.quivr.com" 12 | 13 | # # Upload a URL 14 | # url_response = await megaparse.url.upload(url) 15 | # print(f"\n----- URL Response : {url} -----\n") 16 | # print(url_response) 17 | 18 | # file_path = "megaparse/sdk/pdf/MegaFake_report.pdf" 19 | file_path = ( 20 | "megaparse/sdk/examples/only_pdfs/4 The Language of Medicine 2024.07.21.pdf" 21 | ) 22 | # Upload a file 23 | response = await megaparse.file.upload( 24 | file_path=file_path, 25 | method="unstructured", # type: ignore # unstructured, llama_parser, megaparse_vision 26 | strategy="auto", # type: ignore # fast, auto, hi_res 27 | ) 28 | print(f"\n----- File Response : {file_path} -----\n") 29 | print(response) 30 | await megaparse.close() 31 | 32 | 33 | if __name__ == "__main__": 34 | asyncio.run(main()) 35 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/megaparse_sdk/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .client import MegaParseClient 2 | from .endpoints.file_upload import FileUpload 3 | from .endpoints.url_upload import URLUpload 4 | 5 | 6 | class MegaParseSDK: 7 | def __init__(self, api_key: str | None = None, base_url: str | None = None): 8 | self.client = MegaParseClient(api_key, base_url) 9 | self.file = FileUpload(self.client) 10 | self.url = URLUpload(self.client) 11 | 12 | async def close(self): 13 | await self.client.close() 14 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/megaparse_sdk/client.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import enum 3 | import logging 4 | import os 5 | from io import BytesIO 6 | from pathlib import Path 7 | from types import TracebackType 8 | from typing import Any, Self 9 | 10 | import httpx 11 | import nats 12 | from nats.errors import NoRespondersError, TimeoutError 13 | 14 | from megaparse_sdk.config import ClientNATSConfig, MegaParseSDKConfig 15 | from megaparse_sdk.schema.document import Document 16 | from megaparse_sdk.schema.mp_exceptions import ( 17 | DownloadError, 18 | InternalServiceError, 19 | MemoryLimitExceeded, 20 | ModelNotSupported, 21 | ParsingException, 22 | ) 23 | from megaparse_sdk.schema.mp_inputs import ( 24 | FileInput, 25 | MPInput, 26 | ParseFileConfig, 27 | ParseFileInput, 28 | ParseUrlInput, 29 | ) 30 | from megaparse_sdk.schema.mp_outputs import ( 31 | MPErrorType, 32 | MPOutput, 33 | MPOutputType, 34 | ) 35 | from megaparse_sdk.utils.load_ssl import load_ssl_cxt 36 | 37 | logger = logging.getLogger("megparse_sdk") 38 | 39 | 40 | class MegaParseClient: 41 | def __init__( 42 | self, 43 | api_key: str | None = None, 44 | base_url: str | None = None, 45 | ): 46 | config = MegaParseSDKConfig() 47 | self.base_url = base_url or config.url 48 | self.api_key = api_key or config.api_key 49 | 
self.max_retries = config.max_retries 50 | if self.api_key: 51 | self.session = httpx.AsyncClient( 52 | headers={"x-api-key": self.api_key}, timeout=config.timeout 53 | ) 54 | else: 55 | self.session = httpx.AsyncClient(timeout=config.timeout) 56 | 57 | async def request(self, method: str, endpoint: str, **kwargs: Any) -> Any: 58 | url = f"{self.base_url}{endpoint}" 59 | client = self.session 60 | for attempt in range(self.max_retries): 61 | try: 62 | response = await client.request(method, url, **kwargs) 63 | response.raise_for_status() 64 | return response.json() 65 | except (httpx.HTTPStatusError, httpx.RequestError): 66 | if attempt < self.max_retries - 1: 67 | await asyncio.sleep(2**attempt) # Exponential backoff 68 | 69 | raise RuntimeError(f"Can't send request to the server: {url}") 70 | 71 | async def close(self): 72 | await self.session.aclose() 73 | 74 | 75 | class ClientState(enum.Enum): 76 | # First state of the client 77 | UNOPENED = 1 78 | # Client has either sent a request, or is within a `with` block. 79 | OPENED = 2 80 | # Client has either exited the `with` block, or `close()` called. 
81 | CLOSED = 3 82 | 83 | 84 | class MegaParseNATSClient: 85 | def __init__(self, config: ClientNATSConfig): 86 | self.nc_config = config 87 | self.max_retries = self.nc_config.max_retries 88 | self.backoff = self.nc_config.backoff 89 | if self.nc_config.ssl_config: 90 | self.ssl_ctx = load_ssl_cxt(self.nc_config.ssl_config) 91 | else: 92 | self.ssl_ctx = None 93 | # Client connection 94 | self._state = ClientState.UNOPENED 95 | self._nc = None 96 | 97 | async def _get_nc(self): 98 | if self._nc is None: 99 | self._nc = await nats.connect( 100 | self.nc_config.endpoint, 101 | tls=self.ssl_ctx, 102 | connect_timeout=self.nc_config.connect_timeout, 103 | reconnect_time_wait=self.nc_config.reconnect_time_wait, 104 | max_reconnect_attempts=self.nc_config.max_reconnect_attempts, 105 | ) 106 | return self._nc 107 | return self._nc 108 | 109 | async def __aenter__(self: Self) -> Self: 110 | if self._state != ClientState.UNOPENED: 111 | msg = { 112 | ClientState.OPENED: "Cannot open a client instance more than once.", 113 | ClientState.CLOSED: ( 114 | "Cannot reopen a client instance, client was closed." 
115 | ), 116 | }[self._state] 117 | raise RuntimeError(msg) 118 | 119 | self._state = ClientState.OPENED 120 | 121 | await self._get_nc() 122 | return self 123 | 124 | async def __aexit__( 125 | self, 126 | exc_type: type[BaseException] | None = None, 127 | exc_value: BaseException | None = None, 128 | traceback: TracebackType | None = None, 129 | ) -> None: 130 | self._state = ClientState.CLOSED 131 | await self.aclose() 132 | 133 | async def parse_url(self, url: str): 134 | url_inp = ParseUrlInput(url=url) 135 | return await self._send_req(MPInput(input=url_inp)) 136 | 137 | async def parse_file( 138 | self, file: Path | BytesIO, file_name: str | None = None 139 | ) -> str | Document: 140 | if isinstance(file, Path): 141 | with open(file, "rb") as f: 142 | data = f.read() 143 | file_name = os.path.basename(file) 144 | else: 145 | file.seek(0) 146 | data = file.read() 147 | if file_name is None: 148 | raise ValueError("please provide file_name if passing ByteIO stream") 149 | 150 | file_input = ParseFileInput( 151 | file_input=FileInput(file_name=file_name, file_size=len(data), data=data), 152 | parse_config=ParseFileConfig(), 153 | ) 154 | 155 | inp = MPInput(input=file_input) 156 | return await self._send_req(inp) 157 | 158 | async def _send_req(self, inp: MPInput) -> str | Document: 159 | logger.debug(f"Sending {inp} to megaparse service.") 160 | 161 | for attempt in range(self.max_retries): 162 | try: 163 | return await self._send_req_inner(inp) 164 | except (TimeoutError, NoRespondersError) as e: 165 | logger.error(f"Sending req error: {e}. 
Retrying for {attempt} time") 166 | if attempt < self.max_retries - 1: 167 | logger.debug(f"Backoff for {2**self.backoff}s") 168 | await asyncio.sleep(2**self.backoff) 169 | raise ParsingException 170 | 171 | async def _send_req_inner(self, inp: MPInput): 172 | nc = await self._get_nc() 173 | raw_response = await nc.request( 174 | self.nc_config.subject, 175 | inp.model_dump_json().encode("utf-8"), 176 | timeout=self.nc_config.timeout, 177 | ) 178 | response = MPOutput.model_validate_json(raw_response.data.decode("utf-8")) 179 | return self._handle_mp_output(response) 180 | 181 | def _handle_mp_output(self, response: MPOutput) -> str | Document: 182 | if response.output_type == MPOutputType.PARSE_OK: 183 | assert response.result, "Parsing OK but response is None" 184 | return response.result 185 | elif response.output_type == MPOutputType.PARSE_ERR: 186 | assert response.err, "Parsing OK but response is None" 187 | match response.err.mp_err_code: 188 | case MPErrorType.MEMORY_LIMIT: 189 | raise MemoryLimitExceeded 190 | case MPErrorType.INTERNAL_SERVER_ERROR: 191 | raise InternalServiceError 192 | case MPErrorType.MODEL_NOT_SUPPORTED: 193 | raise ModelNotSupported 194 | case MPErrorType.DOWNLOAD_ERROR: 195 | raise DownloadError 196 | case MPErrorType.PARSING_ERROR: 197 | raise ParsingException 198 | raise ValueError(f"unknown service response type: {response}") 199 | 200 | async def aclose(self): 201 | nc = await self._get_nc() 202 | await nc.close() 203 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/megaparse_sdk/config.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, FilePath 2 | from pydantic_settings import BaseSettings, SettingsConfigDict 3 | 4 | 5 | class MegaParseSDKConfig(BaseSettings): 6 | """ 7 | Configuration for the Megaparse SDK. 
8 | """ 9 | 10 | model_config = SettingsConfigDict(env_prefix="MEGAPARSE_SDK_") 11 | api_key: str | None = None 12 | url: str = "https://megaparse.tooling.quivr.app" 13 | timeout: int = 600 14 | max_retries: int = 3 15 | 16 | 17 | class SSLConfig(BaseModel): 18 | ssl_key_file: FilePath 19 | ssl_cert_file: FilePath 20 | ca_cert_file: FilePath | None = None 21 | 22 | 23 | class ClientNATSConfig(BaseSettings): 24 | model_config = SettingsConfigDict( 25 | env_prefix="MEGAPARSE_NATS_", 26 | env_file=(".env.local", ".env"), 27 | env_nested_delimiter="__", 28 | extra="ignore", 29 | ) 30 | subject: str = "parsing" 31 | endpoint: str = "https://tests@nats.tooling.quivr.app:4222" 32 | timeout: float = 20 33 | max_retries: int = 5 34 | backoff: float = 3 35 | connect_timeout: int = 5 36 | reconnect_time_wait: int = 1 37 | max_reconnect_attempts: int = 20 38 | ssl_config: SSLConfig | None = None 39 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/megaparse_sdk/endpoints/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/megaparse_sdk/endpoints/file_upload.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from httpx import Response 4 | from pydantic import BaseModel 5 | 6 | from megaparse_sdk.client import MegaParseClient 7 | from megaparse_sdk.schema.languages import Language 8 | from megaparse_sdk.schema.parser_config import ParserType, StrategyEnum 9 | 10 | 11 | class UploadFileConfig(BaseModel): 12 | method: ParserType 13 | strategy: StrategyEnum 14 | check_table: bool 15 | language: Language 16 | parsing_instruction: str | None = None 17 | model_name: str = "gpt-4o" 18 | 19 | 20 | class FileUpload: 21 | def __init__(self, client: MegaParseClient): 22 | self.client = client 23 | 24 | async def 
upload( 25 | self, 26 | file_path: str, 27 | method: ParserType = ParserType.UNSTRUCTURED, 28 | strategy: StrategyEnum = StrategyEnum.AUTO, 29 | check_table: bool = False, 30 | language: Language = Language.ENGLISH, 31 | parsing_instruction: Optional[str] = None, 32 | model_name: str = "gpt-4o", 33 | ) -> Response: 34 | data = UploadFileConfig( 35 | method=method, 36 | strategy=strategy, 37 | check_table=check_table, 38 | language=language, 39 | parsing_instruction=parsing_instruction, 40 | model_name=model_name, 41 | ) 42 | with open(file_path, "rb") as file: 43 | files = {"file": (file_path, file)} 44 | 45 | response = await self.client.request( 46 | "POST", 47 | "/v1/file", 48 | files=files, 49 | data=data.model_dump(mode="json"), 50 | ) 51 | return response 52 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/megaparse_sdk/endpoints/url_upload.py: -------------------------------------------------------------------------------- 1 | from httpx import Response 2 | 3 | from megaparse_sdk.client import MegaParseClient 4 | 5 | 6 | class URLUpload: 7 | def __init__(self, client: MegaParseClient): 8 | self.client = client 9 | 10 | async def upload(self, url: str, max_retries: int = 3) -> Response: 11 | endpoint = f"/v1/url?url={url}" 12 | headers = {"accept": "application/json"} 13 | response = await self.client.request("POST", endpoint, headers=headers, data="") 14 | return response 15 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/megaparse_sdk/schema/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/megaparse_sdk/schema/extensions.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class FileExtension(str, Enum): 5 | """Supported file 
extension enumeration.""" 6 | 7 | _mimetype: str 8 | 9 | def __new__(cls, value: str, mimetype: str): 10 | obj = str.__new__(cls, value) 11 | obj._value_ = value 12 | obj._mimetype = mimetype 13 | return obj 14 | 15 | PDF = (".pdf", "application/pdf") 16 | DOCX = ( 17 | ".docx", 18 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 19 | ) 20 | TXT = (".txt", "text/plain") 21 | OTF = (".odt", "application/vnd.oasis.opendocument.text") 22 | EPUB = (".epub", "application/epub") 23 | HTML = (".html", "text/html") 24 | XML = (".xml", "application/xml") 25 | CSV = (".csv", "text/csv") 26 | XLSX = ( 27 | ".xlsx", 28 | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 29 | ) 30 | XLS = (".xls", "application/vnd.ms-excel") 31 | PPTX = ( 32 | ".pptx", 33 | "application/vnd.openxmlformats-officedocument.presentationml.presentation", 34 | ) 35 | MD = (".md", "text/markdown") 36 | MARKDOWN = (".markdown", "text/markdown") 37 | 38 | @property 39 | def mimetype(self) -> str: 40 | return self._mimetype 41 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/megaparse_sdk/schema/languages.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class Language(str, Enum): 5 | BAZA = "abq" 6 | ADYGHE = "ady" 7 | AFRIKAANS = "af" 8 | ANGIKA = "ang" 9 | ARABIC = "ar" 10 | ASSAMESE = "as" 11 | AVAR = "ava" 12 | AZERBAIJANI = "az" 13 | BELARUSIAN = "be" 14 | BULGARIAN = "bg" 15 | BIHARI = "bh" 16 | BHOJPURI = "bho" 17 | BENGALI = "bn" 18 | BOSNIAN = "bs" 19 | SIMPLIFIED_CHINESE = "ch_sim" 20 | TRADITIONAL_CHINESE = "ch_tra" 21 | CHECHEN = "che" 22 | CZECH = "cs" 23 | WELSH = "cy" 24 | DANISH = "da" 25 | DARGWA = "dar" 26 | GERMAN = "de" 27 | ENGLISH = "en" 28 | SPANISH = "es" 29 | ESTONIAN = "et" 30 | PERSIAN_FARSI = "fa" 31 | FRENCH = "fr" 32 | IRISH = "ga" 33 | GOAN_KONKANI = "gom" 34 | HINDI = "hi" 35 | CROATIAN = "hr" 36 | 
class ModelNotSupported(Exception):
    """Raised when the requested LLM model is not in the supported list."""

    def __init__(
        self,
        message: str = "The requested model is not supported yet.",
    ):
        super().__init__(message)


class MemoryLimitExceeded(Exception):
    """Raised when the parsing service is under high memory pressure."""

    def __init__(self, message="The service is under high memory pressure"):
        super().__init__(message)


class InternalServiceError(Exception):
    """Raised for unexpected server-side failures during parsing."""

    # Fixed typo in the default message: "occured" -> "occurred".
    def __init__(self, message="Internal service error occurred"):
        super().__init__(message)


class DownloadError(Exception):
    """Raised when the remote file could not be downloaded."""

    def __init__(self, message="Failed to download the file"):
        super().__init__(message)
class FileInput(BaseModel):
    """Payload describing a file shipped inline over the message bus.

    ``data`` travels Base64-encoded on the wire (see ``serialize_data``)
    and is exposed as raw ``bytes`` on the model.
    """

    # Original file name, reported back in results/metadata.
    file_name: str
    # Size in bytes of the decoded payload.
    file_size: int
    # Raw file contents.
    data: bytes

    @field_validator("data", mode="before")
    def decode_data(cls, value):
        # Accept both raw bytes and the Base64 string used on the wire.
        if isinstance(value, str):
            try:
                return base64.b64decode(value)
            except Exception:
                raise ValueError("Invalid Base64 encoding for the 'data' field.")
        return value

    # TODO: this is slow !!! Move to reading bytes directly from bucket storage
    # append bytes with CRC32
    @field_serializer("data", return_type=str)
    def serialize_data(self, data: bytes, _info):
        # Mirror of decode_data: emit Base64 text for JSON transport.
        return base64.b64encode(data).decode("utf-8")


class MPParseType(str, Enum):
    """Discriminator values distinguishing the two parse request shapes."""

    PARSE_FILE = "parse_file"
    PARSE_URL = "parse_url"


class ParseFileInput(BaseModel):
    """Request to parse an inline file with an explicit parser configuration."""

    mp_parse_type: Literal[MPParseType.PARSE_FILE] = MPParseType.PARSE_FILE
    file_input: FileInput
    parse_config: ParseFileConfig


class ParseUrlInput(BaseModel):
    """Request to parse a document fetched from a URL by the worker."""

    mp_parse_type: Literal[MPParseType.PARSE_URL] = MPParseType.PARSE_URL
    url: str


class MPInput(BaseModel):
    """Envelope for any parse request; ``mp_parse_type`` selects the variant."""

    input: Union[ParseFileInput, ParseUrlInput] = Field(
        ..., discriminator="mp_parse_type"
    )
class MPErrorType(Enum):
    """Error categories a parse worker can report back to the client."""

    MEMORY_LIMIT = auto()
    INTERNAL_SERVER_ERROR = auto()
    MODEL_NOT_SUPPORTED = auto()
    DOWNLOAD_ERROR = auto()
    PARSING_ERROR = auto()


class ParseError(BaseModel):
    """Machine-readable error code plus a human-readable message."""

    mp_err_code: MPErrorType
    message: str


class MPOutputType(str, Enum):
    """Outcome discriminator for a parse response."""

    PARSE_OK = "parse_file_ok"
    PARSE_ERR = "parse_file_err"


class MPOutput(BaseModel):
    """Response envelope for a parse request.

    On ``PARSE_OK`` the payload is carried in ``result``; on ``PARSE_ERR``
    callers read ``err`` and ``result`` is sent as ``None``.
    """

    output_type: MPOutputType
    # Parsed text or structured Document on success; None on error.
    result: str | Document | None
    # Populated only for PARSE_ERR responses.
    err: ParseError | None = None
    metadata: Dict[str, str] = Field(default_factory=dict)
class SupportedModel(str, Enum):
    """Supported models enumeration.

    Members are ``str`` subclasses whose value is the provider model id,
    so they compare equal to the plain string
    (e.g. ``SupportedModel.GPT_4 == "gpt-4"``).
    """

    # OpenAI Models
    GPT_4 = "gpt-4"
    GPT_4_TURBO = "gpt-4-turbo"
    GPT_3_5_TURBO = "gpt-3.5-turbo"
    GPT_4O = "gpt-4o"
    GPT_4O_MINI = "gpt-4o-mini"

    # Anthropic Models
    CLAUDE_3_5_SONNET_LATEST = "claude-3-5-sonnet-latest"
    CLAUDE_3_5_SONNET = "claude-3-5-sonnet-20241022"
    CLAUDE_3_5_HAIKU = "claude-3-5-haiku-20241022"
    CLAUDE_3_5_HAIKU_LATEST = "claude-3-5-haiku-latest"
    CLAUDE_3_OPUS = "claude-3-opus-20240229"
    CLAUDE_3_OPUS_LATEST = "claude-3-opus-latest"
    CLAUDE_3_SONNET = "claude-3-sonnet-20240229"
    CLAUDE_3_HAIKU = "claude-3-haiku-20240307"

    def __str__(self):
        return self.value

    @classmethod
    def is_supported(cls, model_name: str) -> bool:
        """Check if *model_name* matches a supported model id."""
        # Compare against member values explicitly. The previous
        # `model_name in cls.__members__.values()` only worked through the
        # implicit str-mixin equality between members and plain strings.
        return any(model_name == member.value for member in cls)

    @classmethod
    def get_supported_models(cls) -> list[str]:
        """Get the list of supported model id strings."""
        # Return the plain value strings so the result matches the declared
        # `list[str]` exactly; members compare equal to these values, so
        # membership checks by callers behave as before.
        return [member.value for member in cls]
-------------------------------------------------------------------------------- 1 | [project] 2 | name = "megaparse-sdk" 3 | version = "0.1.12" 4 | description = "Megaparse SDK" 5 | dependencies = [ 6 | "python-dotenv>=1.0.0", 7 | "pycryptodome>=3.21.0", 8 | "psutil>=6.1.0", 9 | "httpx>=0.27.0", 10 | "nats-py>=2.9.0", 11 | "loguru>=0.7.2", 12 | ] 13 | 14 | readme = "README.md" 15 | requires-python = ">= 3.11" 16 | 17 | [build-system] 18 | requires = ["hatchling==1.26.3"] 19 | build-backend = "hatchling.build" 20 | 21 | [tool.rye] 22 | managed = true 23 | dev-dependencies = [] 24 | universal = true 25 | 26 | [tool.hatch.metadata] 27 | allow-direct-references = true 28 | 29 | [tool.hatch.build.targets.wheel] 30 | packages = ["megaparse_sdk"] 31 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/tests/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse_sdk/tests/README.md -------------------------------------------------------------------------------- /libs/megaparse_sdk/tests/certs/client-cert.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIEqDCCAxCgAwIBAgIRAITvq6ZEk6paYFDRbueJhEMwDQYJKoZIhvcNAQELBQAw 3 | gZ0xHjAcBgNVBAoTFW1rY2VydCBkZXZlbG9wbWVudCBDQTE5MDcGA1UECwwwYW1p 4 | bmVAYW1pbmVzLU1hY0Jvb2stUHJvLmxvY2FsIChhbWluZSBkaXJob3Vzc2kpMUAw 5 | PgYDVQQDDDdta2NlcnQgYW1pbmVAYW1pbmVzLU1hY0Jvb2stUHJvLmxvY2FsIChh 6 | bWluZSBkaXJob3Vzc2kpMB4XDTI0MTExOTEwNDgwN1oXDTI3MDIxOTEwNDgwN1ow 7 | ZDEnMCUGA1UEChMebWtjZXJ0IGRldmVsb3BtZW50IGNlcnRpZmljYXRlMTkwNwYD 8 | VQQLDDBhbWluZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFtaW5lIGRpcmhv 9 | dXNzaSkwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQC2fDlGlKYIj8bp 10 | tlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5 11 | 
KDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH 12 | qmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN 13 | gLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8 14 | ghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT 15 | WWVVcNfJAgMBAAGjgZowgZcwDgYDVR0PAQH/BAQDAgWgMCcGA1UdJQQgMB4GCCsG 16 | AQUFBwMCBggrBgEFBQcDAQYIKwYBBQUHAwQwHwYDVR0jBBgwFoAUV2w3gvQM5La1 17 | 2fk80tJXoM/14l4wOwYDVR0RBDQwMoIJbG9jYWxob3N0gRNtZWdhcGFyc2VAcXVp 18 | dnIuYXBwhxAAAAAAAAAAAAAAAAAAAAABMA0GCSqGSIb3DQEBCwUAA4IBgQAYq4VZ 19 | 6spwGvcqg8kCOghu6o54UPYo/NLzh3oYewJnDJ+2XD786TpTgjZMGA6Ms+det6oV 20 | HdT5s77VFgJiJloHlD0fpKkRxjzyBOk5/bQcCKkTMBVfgJbMoAfa2gq+/7zxmLcn 21 | AmNg7BkmsTtHWPsLyN3rYI4dkkDKWkxp8Sezm9WPEa9OGJDJSYf4Dq9pN1lUoP1p 22 | vxsq7sW0HDWnx/I2zWuz3AaT9b4UayRnk4IRYxAuYYN/k0GNjVmmDveywNoNlkmW 23 | 0Az6ycPN+vvz8Jpm3CbZSIQLO8Yn57H/aU4DmOtunm3VLUiLucmfOggv8Sq5n2g9 24 | ze61UJu9lr2/nWOXnErl3V9UL3kJ1OlbFzTWDGm9zX7boo6MLXy+fAj+Tw0sCeMr 25 | drdxo8IUYYU6HUdtuLGMFznBFFUNhfFSwFANGPB38NyofwLPSZM0hYntQqBMt/P7 26 | /E+wQ67hSEutkIbOD3kGkGREIk3dVyUeajO9DFTaQ+yTnNtnuUbxs5LkRlw= 27 | -----END CERTIFICATE----- 28 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/tests/certs/client-key.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN PRIVATE KEY----- 2 | MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQC2fDlGlKYIj8bp 3 | tlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5 4 | KDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH 5 | qmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN 6 | gLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8 7 | ghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT 8 | WWVVcNfJAgMBAAECggEBAIK2AlSzHyacze8UH16qDTzibGVRGjxkf895Rnqi6COU 9 | QYD3PQrsVYCS/sMbHiujHV7FZC+rRcmufaBTVl7bH10yGIQc28iZ2YtbsppTEkTj 10 | 
rGUynTtXJPNHZ2vJOs1I9LXdk7maogPN2zzraIQP7AgTGCSOclIi3fpfRmfKwUOj 11 | BkEzj7CbaAGtW9vTamPJG/+wgaaBcPhplQk4cD2mjdaMLfGQXNXiYgp09kf0hJ2k 12 | 0QbsQBC85bMSfmPAsoTRLxi94S12at3SABgF0oOCy9FZs/sWsdJRI6nbfvZ3C4xo 13 | 8y+rH7Yaej7AYK+jbU3Uk/1473cuCAnNKg65UyU4+gECgYEA2/ZQYRDU3JWNHQGy 14 | dJXZRl6hSFCw9y9RUc/QjcRs+VlnXE5UK1eLwfcKh0YYRhIWSE8z3mZmK09M/FG0 15 | xbU4qIZbDYcAI2nCiUeT8HmTjVSPMS1oWZrt7rh00gcyoLQt2TUS3bo2tsmdPyWW 16 | OgEiYfb4MoG/KCdYlACE6O4GMMECgYEA1GIMIHM2x4B1wgLnKeI3X2wYWuYCHtFB 17 | Px56GUFTZytBsHghxtovVlLh88FNS5rthvXuE0FHE9RljKhZaNgqrPOrlAZSuv18 18 | vK7RmG/NPJl2osbs677a/xoxNuVkfrRcxl4cvYOBL5huHo1D5sOitGFW+IlscgWY 19 | nWzXlY7AYQkCgYA6H96hp7b4CzTc42Pq1uYxaDQqTdhVmVVdzxKHQ86gHXXouHIZ 20 | eereeI95q5YifgkRVoyYSmrZKv1m95hTXk34inhpHLF2qi3T5Ow88YOCJ0QndJ5M 21 | f1o8aNXF4k0IllQ/P30axmhK6P/6fc4yybXyOTbg8dQ3oh4QDgsRGkTcgQKBgQCG 22 | qLgJpyN3cPK5FYAeJUl4nh//GlED2yekbp15/9py0pFu42x/GX3kHN8Y31oz8sJh 23 | zPKrkLsRTp0ohuFRwaWlTUZfr3arCugY9jr8jP6zSpZW9QvpGXTfRGsp5F5Im/Eq 24 | 8ScF3ih91gcUJfuEiExUVFeBdBinXvb58bXrJLzDiQKBgG+Z06uj2dWxtK4nqJvP 25 | HllTocAGVm+fEmupVsLU6ksVVrOl8O9TapMbY8pUj9J5oBYJvY+KFGoIoxYwhZrz 26 | 4NqY7iv8w+LQ7mQIwcQ4B67pDAQMJZTShR5v57FlAZldP5UpE5ASt22isBW31sYI 27 | 1OaXIqrCA/V43NydDezh0ylQ 28 | -----END PRIVATE KEY----- 29 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/tests/certs/rootCA.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIFCzCCA3OgAwIBAgIQESt0eck2KvFrAMyiDyceujANBgkqhkiG9w0BAQsFADCB 3 | nTEeMBwGA1UEChMVbWtjZXJ0IGRldmVsb3BtZW50IENBMTkwNwYDVQQLDDBhbWlu 4 | ZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFtaW5lIGRpcmhvdXNzaSkxQDA+ 5 | BgNVBAMMN21rY2VydCBhbWluZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFt 6 | aW5lIGRpcmhvdXNzaSkwHhcNMjQxMTE5MTAwMTA5WhcNMzQxMTE5MTAwMTA5WjCB 7 | nTEeMBwGA1UEChMVbWtjZXJ0IGRldmVsb3BtZW50IENBMTkwNwYDVQQLDDBhbWlu 8 | ZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFtaW5lIGRpcmhvdXNzaSkxQDA+ 9 | 
BgNVBAMMN21rY2VydCBhbWluZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFt 10 | aW5lIGRpcmhvdXNzaSkwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQCw 11 | 6TX1kvqVMb8ZUQVT/vuDsedmbYgSFn68yJRlmE9BsqG7TLQHl2Kw6VQqZBSIkeZG 12 | CypmUysX/3qrvICeArIdmmsrWUTDYPoauw/a/RY0I07rALj3YR0Y7039Hxf/UPT9 13 | xlUtnM2NafkZyp6WRjEN0N4ETvJDIbUQiosiiPilxhwRbJURhT/JPskaw+OM2Sw5 14 | dFAT20zkYC5VIc4wJBFLAMG0XzI6Sy/4wI1WdRBXd2UMpQU4u7TyD0RB4mnHorV6 15 | kXjtLKD/KWSrSG1nnum9SB9eVatbRD+TUgoclwAKedrlCDEM4EsXVVuUuYCizQNb 16 | +H3BSPfj1upUW5eKfgAyB+8r4QGf2yCY9O8NMMrJ1K5Qv4vSuWAU2tZqAyE8Z4Ke 17 | UtHsl/M0zIvIKwyki2N/rieL/m6lTzS3dwSf9vv7eePEvxd8SBClSF07MUzyxkZ5 18 | UYNxaK5t2ZRADZ6n/9/hAQsMscCkHiX1N2ypBFV+86Pr78BC48JgIyCMwuiBN4sC 19 | AwEAAaNFMEMwDgYDVR0PAQH/BAQDAgIEMBIGA1UdEwEB/wQIMAYBAf8CAQAwHQYD 20 | VR0OBBYEFFdsN4L0DOS2tdn5PNLSV6DP9eJeMA0GCSqGSIb3DQEBCwUAA4IBgQBj 21 | KosfLfW/ZH80NM16pvpyRF3mCi+q+I+P8zrfilMYJBH4EEdEGAUgTO5do1kJXeel 22 | Wky+FNxaP6KCNiT+0amypKg+yjBlnqLKVdnEgR5s12ZfmerV59stx1A/c/bYMEAS 23 | re6xskBkowP2cVQHAC2dy/0Ov+lZsiNaPV2bQx6KUJurveebUQsH3uF3ZEhnUVQ6 24 | rt5+JGY4x9Tr1YMhvHqEDTrsipPdDB1MyW1SnCkqSXrz+DPXGd8BW0O0hpM5la81 25 | J+rfZGinbcUgXM6JMLIHDxLc4Xxzm4NijFzXhbR3XPXqEwsnZOuxcYYFgUGs3FwS 26 | 4ro+34a/O4uKS2KV8wsUWj/tWD2rLpduDgag4WSipCvWtaNve8gPdUiyPxUqxyoZ 27 | aFAFg/izXwmRntogJtV0Zvo3fqAaQQDl8t2s21IIx0wmgHzgmkswb5OwFg3dOn/S 28 | lmaH8v7FCBP7jHx/NCPTT5Sy/1EMRATmhFDUZ8Bod/TIlV3e+FCVqlX3kBBRbAU= 29 | -----END CERTIFICATE----- 30 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/tests/pdf/MegaFake_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse_sdk/tests/pdf/MegaFake_report.pdf -------------------------------------------------------------------------------- /libs/megaparse_sdk/tests/pdf/sample_table.pdf: -------------------------------------------------------------------------------- 
# Connection parameters for the local NATS broker the tests talk to.
NATS_URL = "nats://test@127.0.0.1:4222"
NATS_SUBJECT = "parsing"
SSL_CERT_FILE = "./tests/certs/client-cert.pem"
SSL_KEY_FILE = "./tests/certs/client-key.pem"
CA_CERT_FILE = "./tests/certs/rootCA.pem"


@pytest.fixture(scope="session")
def ssl_config() -> SSLConfig:
    """mTLS material (client cert/key + root CA) shared by the test session."""
    return SSLConfig(
        ca_cert_file=CA_CERT_FILE,
        ssl_key_file=SSL_KEY_FILE,
        ssl_cert_file=SSL_CERT_FILE,
    )


@pytest.fixture(scope="session")
def nc_config(ssl_config: SSLConfig) -> ClientNATSConfig:
    """Client config tuned for fast failure: tiny timeout, single retry.

    NOTE(review): backoff=-1 presumably makes the retry delay non-positive
    so tests never sleep — confirm against ClientNATSConfig's retry logic.
    """
    config = ClientNATSConfig(
        subject=NATS_SUBJECT,
        endpoint=NATS_URL,
        ssl_config=ssl_config,
        timeout=0.5,
        max_retries=1,
        backoff=-1,
        connect_timeout=1,
        reconnect_time_wait=1,
        max_reconnect_attempts=1,
    )
    return config
@pytest_asyncio.fixture(scope="function")
async def nats_service(nc_config: ClientNATSConfig):
    """Yield a raw NATS connection used by tests to play the parsing worker.

    The connection is drained (flushed and unsubscribed) at teardown.
    """
    # TODO: fix TLS handshake to work in CI
    # ssl_config = load_ssl_cxt(nc_config.ssl_config)
    # Fixed: the previous code passed `tls=ssl_config`, but `ssl_config` was
    # not requested as a fixture parameter here, so the name resolved to the
    # module-level fixture *function* — never a valid ssl.SSLContext. The
    # plain nats:// URL meant the argument was ignored, masking the bug.
    # Drop the argument entirely until TLS is re-enabled via load_ssl_cxt.
    nc = await nats.connect(
        nc_config.endpoint,
        connect_timeout=nc_config.connect_timeout,
        reconnect_time_wait=nc_config.reconnect_time_wait,
        max_reconnect_attempts=nc_config.max_reconnect_attempts,
    )
    yield nc
    await nc.drain()
@pytest.mark.asyncio(loop_scope="session")
async def test_client_parse_timeout(nats_service: Client, ssl_config: SSLConfig):
    """A worker that never replies within `timeout` surfaces ParsingException."""
    nc_config = ClientNATSConfig(
        subject=NATS_SUBJECT,
        endpoint=NATS_URL,
        ssl_config=ssl_config,
        timeout=0.1,
        max_retries=1,
        backoff=1,
    )

    async def service(msg):
        # Sleep past the client timeout so the request always expires
        # without a reply being published.
        await asyncio.sleep(2 * nc_config.timeout)

    await nats_service.subscribe(NATS_SUBJECT, "worker", cb=service)

    file_path = Path("./tests/pdf/sample_table.pdf")
    with pytest.raises(ParsingException):
        async with MegaParseNATSClient(nc_config) as mp_client:
            await mp_client.parse_file(file=file_path)


@pytest.mark.asyncio(loop_scope="session")
async def test_client_parse_timeout_retry(nats_service: Client, ssl_config: SSLConfig):
    """Every configured attempt reaches the worker before the client gives up.

    NOTE(review): backoff=-5 presumably makes the inter-retry delay
    non-positive so both attempts happen immediately — confirm against the
    client's retry implementation.
    """
    nc_config = ClientNATSConfig(
        subject=NATS_SUBJECT,
        endpoint=NATS_URL,
        ssl_config=ssl_config,
        timeout=0.1,
        max_retries=2,
        backoff=-5,
    )

    # Collects every request the worker receives, one entry per attempt.
    msgs = []

    async def service(msg):
        msgs.append(msg)
        # Never answer in time, forcing the client to retry.
        await asyncio.sleep(2 * nc_config.timeout)

    await nats_service.subscribe(NATS_SUBJECT, "worker", cb=service)

    file_path = Path("./tests/pdf/sample_table.pdf")
    with pytest.raises(ParsingException):
        async with MegaParseNATSClient(nc_config) as mp_client:
            await mp_client.parse_file(file=file_path)
    # max_retries=2 means exactly two requests must have hit the worker.
    assert len(msgs) == 2
@pytest.mark.asyncio(loop_scope="session")
@pytest.mark.parametrize(
    "mp_error_type, exception_class",
    [
        ("MEMORY_LIMIT", MemoryLimitExceeded),
        ("INTERNAL_SERVER_ERROR", InternalServiceError),
        ("MODEL_NOT_SUPPORTED", ModelNotSupported),
        ("DOWNLOAD_ERROR", DownloadError),
        ("PARSING_ERROR", ParsingException),
    ],
)
async def test_client_parse_file_excp(
    nats_service: Client, nc_config: ClientNATSConfig, mp_error_type, exception_class
):
    """Each MPErrorType the worker reports maps to its SDK exception class."""

    async def message_handler(msg):
        # Fake worker: validate the request shape, then answer with a
        # PARSE_ERR carrying the parametrized error code.
        parsed_input = MPInput.model_validate_json(msg.data.decode("utf-8")).input
        assert isinstance(parsed_input, ParseFileInput)
        err = ParseError(mp_err_code=MPErrorType[mp_error_type], message="")
        output = MPOutput(
            output_type=MPOutputType.PARSE_ERR,
            err=err,
            result=None,
        )
        await nats_service.publish(msg.reply, output.model_dump_json().encode("utf-8"))

    await nats_service.subscribe(NATS_SUBJECT, "worker", cb=message_handler)

    file_path = Path("./tests/pdf/sample_table.pdf")
    # The client must translate the wire-level error code into the
    # corresponding exception type for callers.
    with pytest.raises(exception_class):
        async with MegaParseNATSClient(nc_config) as mp_client:
            await mp_client.parse_file(file=file_path)
14 | "packaging>=22.0", 15 | ] 16 | 17 | [build-system] 18 | requires = ["hatchling==1.26.3"] 19 | build-backend = "hatchling.build" 20 | 21 | [tool.rye] 22 | python = ">= 3.11" 23 | managed = true 24 | universal = true 25 | dev-dependencies = [ 26 | "mypy>=1.11.1", 27 | "pre-commit>=3.8.0", 28 | "ipykernel>=6.29.5", 29 | "ruff>=0.6.0", 30 | "flake8>=7.1.1", 31 | "flake8-black>=0.3.6", 32 | "pytest-asyncio>=0.23.8", 33 | "pytest>=8.3.3", 34 | "pytest-xdist>=3.6.1", 35 | "pytest-cov>=5.0.0", 36 | "pytest-profiling>=1.8.1", 37 | ] 38 | 39 | [tool.rye.workspace] 40 | members = ["libs/*"] 41 | 42 | [tool.hatch.metadata] 43 | allow-direct-references = true 44 | 45 | [tool.hatch.build.targets.wheel] 46 | packages = ["src/megaparse"] 47 | 48 | [tool.ruff] 49 | line-length = 88 50 | exclude = [".git", "__pycache__", ".mypy_cache", ".pytest_cache"] 51 | 52 | [tool.ruff.lint] 53 | select = [ 54 | "E", # pycodestyle errors 55 | "W", # pycodestyle warnings 56 | "F", # pyflakes 57 | "I", # isort 58 | "C", # flake8-comprehensions 59 | "B", # flake8-bugbear 60 | ] 61 | ignore = [ 62 | "B904", 63 | "B006", 64 | "E501", # line too long, handled by black 65 | "B008", # do not perform function calls in argument defaults 66 | "C901", # too complex 67 | ] 68 | 69 | [tool.ruff.lint.isort] 70 | order-by-type = true 71 | relative-imports-order = "closest-to-furthest" 72 | extra-standard-library = ["typing"] 73 | section-order = [ 74 | "future", 75 | "standard-library", 76 | "third-party", 77 | "first-party", 78 | "local-folder", 79 | ] 80 | known-first-party = [] 81 | 82 | 83 | [tool.pytest.ini_options] 84 | addopts = "--tb=short -ra -v" 85 | asyncio_default_fixture_loop_scope = "session" 86 | filterwarnings = ["ignore::DeprecationWarning"] 87 | markers = [ 88 | "slow: marks tests as slow (deselect with '-m \"not slow\"')", 89 | "base: these tests require quivr-core with extra `base` to be installed", 90 | "tika: these tests require a tika server to be running", 91 | "unstructured: these 
tests require `unstructured` dependency", 92 | ] 93 | -------------------------------------------------------------------------------- /release-please-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://raw.githubusercontent.com/googleapis/release-please/main/schemas/config.json", 3 | "separate-pull-requests": true, 4 | "include-v-in-tag": true, 5 | "bump-patch-for-minor-pre-major": true, 6 | "include-component-in-tag": true, 7 | "packages": { 8 | "libs/megaparse": { 9 | "release-type": "python", 10 | "package-name": "megaparse", 11 | "changelog-notes-type": "github" 12 | }, 13 | "libs/megaparse_sdk": { 14 | "release-type": "python", 15 | "package-name": "megaparse-sdk", 16 | "changelog-notes-type": "github" 17 | } 18 | } 19 | } 20 | --------------------------------------------------------------------------------