├── .aws └── task_definition.json ├── .env.example ├── .flake8 ├── .gitattributes ├── .github └── workflows │ ├── CI.yml │ ├── build-and-deploy.yml │ ├── build-gpu.yml │ ├── release-please.yml │ └── test-build-docker.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── .release-please-manifest.json ├── .vscode ├── extensions.json ├── launch.json └── settings.json ├── CHANGELOG.md ├── Dockerfile ├── Dockerfile.gpu ├── LICENSE ├── Makefile ├── Pipfile ├── README.md ├── benchmark ├── process_single_doc.py └── test_quality_sim.py ├── docker-compose.dev.yml ├── docker-compose.yml ├── docs └── archive.txt ├── evaluations └── script.py ├── images └── tables.png ├── libs ├── megaparse │ ├── .python-version │ ├── CHANGELOG.md │ ├── README.md │ ├── bench.md │ ├── examples │ │ ├── parse_file_fast.py │ │ ├── parse_file_mp.py │ │ └── parse_file_unstructured.py │ ├── program.prof │ ├── pyproject.toml │ ├── src │ │ └── megaparse │ │ │ ├── __init__.py │ │ │ ├── api │ │ │ ├── __init__.py │ │ │ ├── app.py │ │ │ ├── exceptions │ │ │ │ ├── __init__.py │ │ │ │ └── megaparse_exceptions.py │ │ │ └── models │ │ │ │ ├── __init__.py │ │ │ │ └── base.py │ │ │ ├── configs │ │ │ └── auto.py │ │ │ ├── examples │ │ │ ├── parse_file.py │ │ │ └── parsing_process.py │ │ │ ├── exceptions │ │ │ └── base.py │ │ │ ├── formatter │ │ │ ├── base.py │ │ │ ├── structured_formatter │ │ │ │ ├── __init__.py │ │ │ │ └── custom_structured_formatter.py │ │ │ └── table_formatter │ │ │ │ ├── __init__.py │ │ │ │ ├── llm_table_formatter.py │ │ │ │ └── vision_table_formatter.py │ │ │ ├── layout_detection │ │ │ ├── layout_detector.py │ │ │ ├── models │ │ │ │ └── yolov10s-doclaynet.onnx │ │ │ └── output.py │ │ │ ├── megaparse.py │ │ │ ├── models │ │ │ └── page.py │ │ │ ├── parser │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── builder.py │ │ │ ├── doctr_parser.py │ │ │ ├── entity.py │ │ │ ├── llama.py │ │ │ ├── megaparse_vision.py │ │ │ └── unstructured_parser.py │ │ │ ├── predictor │ │ │ └── 
layout_predictor.py │ │ │ └── utils │ │ │ ├── extract_metadata.py │ │ │ ├── onnx.py │ │ │ └── strategy.py │ └── tests │ │ ├── __init__.py │ │ ├── certs │ │ ├── client-cert.pem │ │ └── client-key.pem │ │ ├── conftest.py │ │ ├── data │ │ ├── MegaFake_report.pdf │ │ ├── dummy.pdf │ │ └── grt_example │ │ │ └── MegaFake_report.md │ │ ├── pdf │ │ ├── mlbook.pdf │ │ ├── native │ │ │ ├── 0168011.pdf │ │ │ ├── 0168014.pdf │ │ │ └── 0168029.pdf │ │ ├── ocr │ │ │ ├── 0168003.pdf │ │ │ ├── 0168004.pdf │ │ │ ├── 0168119.pdf │ │ │ ├── 0168120.pdf │ │ │ ├── 0168123.pdf │ │ │ ├── 0168126.pdf │ │ │ ├── 0168127.pdf │ │ │ └── 0168322.pdf │ │ ├── rust.pdf │ │ ├── sample_native.pdf │ │ ├── sample_pdf.pdf │ │ ├── sample_table.pdf │ │ ├── test_detect_ocr.py │ │ ├── test_pdf_processing.py │ │ └── test_pdfium_parser.py │ │ ├── supported_docs │ │ ├── Sway.epub │ │ ├── file-sample_500kB.odt │ │ ├── file_example_XLSX_50.xlsx │ │ ├── file_example_XLS_50.xls │ │ ├── sample.csv │ │ ├── sample.docx │ │ ├── sample.markdown │ │ ├── sample.md │ │ ├── sample.otf │ │ ├── sample.pptx │ │ ├── sample.txt │ │ ├── sample.xml │ │ ├── sample_complexe.html │ │ └── sample_native.pdf │ │ ├── test_endpoints.py │ │ ├── test_import.py │ │ └── test_parsers.py └── megaparse_sdk │ ├── CHANGELOG.md │ ├── README.md │ ├── __init__.py │ ├── examples │ └── usage_example.py │ ├── megaparse_sdk │ ├── __init__.py │ ├── client.py │ ├── config.py │ ├── endpoints │ │ ├── __init__.py │ │ ├── file_upload.py │ │ └── url_upload.py │ ├── schema │ │ ├── __init__.py │ │ ├── document.py │ │ ├── extensions.py │ │ ├── languages.py │ │ ├── mp_exceptions.py │ │ ├── mp_inputs.py │ │ ├── mp_outputs.py │ │ ├── parser_config.py │ │ └── supported_models.py │ └── utils │ │ └── load_ssl.py │ ├── pyproject.toml │ └── tests │ ├── README.md │ ├── certs │ ├── client-cert.pem │ ├── client-key.pem │ └── rootCA.pem │ ├── pdf │ ├── MegaFake_report.pdf │ └── sample_table.pdf │ └── test_nats_client.py ├── logo.png ├── pyproject.toml ├── 
release-please-config.json ├── requirements-dev.lock └── requirements.lock /.aws/task_definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "taskDefinitionArn": "arn:aws:ecs:eu-west-1:253053805092:task-definition/megaparse-task:2", 3 | "containerDefinitions": [ 4 | { 5 | "name": "megaparse", 6 | "image": "quay.io/unstructured-io/unstructured-api:latest", 7 | "cpu": 0, 8 | "portMappings": [ 9 | { 10 | "containerPort": 8000, 11 | "hostPort": 8000, 12 | "protocol": "tcp" 13 | } 14 | ], 15 | "essential": true, 16 | "environment": [ 17 | { 18 | "name": "UNSTRUCTURED_HI_RES_MODEL_NAME", 19 | "value": "detectron2_onnx" 20 | }, 21 | { 22 | "name": "UNSTRUCTURED_PARALLEL_MODE_ENABLED", 23 | "value": "false" 24 | } 25 | ], 26 | "mountPoints": [], 27 | "volumesFrom": [], 28 | "logConfiguration": { 29 | "logDriver": "awslogs", 30 | "options": { 31 | "awslogs-group": "/ecs/megaparse", 32 | "awslogs-region": "eu-west-1", 33 | "awslogs-stream-prefix": "ecs" 34 | } 35 | }, 36 | "systemControls": [] 37 | } 38 | ], 39 | "family": "megaparse-task", 40 | "executionRoleArn": "arn:aws:iam::253053805092:role/megaparse-ecsTaskExecutionRole", 41 | "networkMode": "awsvpc", 42 | "revision": 2, 43 | "volumes": [], 44 | "status": "ACTIVE", 45 | "requiresAttributes": [ 46 | { 47 | "name": "com.amazonaws.ecs.capability.logging-driver.awslogs" 48 | }, 49 | { 50 | "name": "ecs.capability.execution-role-awslogs" 51 | }, 52 | { 53 | "name": "com.amazonaws.ecs.capability.docker-remote-api.1.19" 54 | }, 55 | { 56 | "name": "com.amazonaws.ecs.capability.docker-remote-api.1.18" 57 | }, 58 | { 59 | "name": "ecs.capability.task-eni" 60 | } 61 | ], 62 | "placementConstraints": [], 63 | "compatibilities": [ 64 | "EC2", 65 | "FARGATE" 66 | ], 67 | "requiresCompatibilities": [ 68 | "FARGATE" 69 | ], 70 | "cpu": "2048", 71 | "memory": "8192", 72 | "tags": [] 73 | } -------------------------------------------------------------------------------- 
/.env.example: -------------------------------------------------------------------------------- 1 | LLAMA_CLOUD_API_KEY=llx-1234567890 2 | OPENAI_API_KEY=sk-1234567890 3 | MEGAPARSE_API_KEY=MyMegaParseKey -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ; Minimal configuration for Flake8 to work with Black. 3 | max-line-length = 100 4 | ignore = E101,E111,E112,E221,E222,E501,E711,E712,W503,W504,F401,BLK100 5 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-vendored 2 | *.html linguist-vendored -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | name: Run tests 2 | 3 | on: 4 | pull_request: 5 | workflow_dispatch: 6 | 7 | env: 8 | NATS_TOKEN: test 9 | 10 | jobs: 11 | test: 12 | name: Run tests on Python ${{ matrix.python-version }} 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: ["3.11", "3.12"] 17 | steps: 18 | - name: 👀 Checkout code 19 | uses: actions/checkout@v2 20 | with: 21 | submodules: true 22 | 23 | - name: Setup apt cache 24 | uses: actions/cache@v2 25 | with: 26 | path: /var/cache/apt/archives 27 | key: ${{ runner.os }}-apt-${{ hashFiles('/etc/apt/sources.list') }} 28 | 29 | - name: 😭 Install system dependencies 30 | run: | 31 | sudo apt-get update && sudo apt-get install -y \ 32 | netcat-traditional \ 33 | unzip \ 34 | libgeos-dev \ 35 | libcurl4-openssl-dev \ 36 | libssl-dev \ 37 | binutils \ 38 | curl \ 39 | git \ 40 | autoconf \ 41 | automake \ 42 | build-essential \ 43 | libtool \ 44 | gcc \ 45 | libmagic-dev \ 46 | poppler-utils \ 47 | tesseract-ocr \ 48 | libreoffice \ 49 | libpq-dev \ 50 | 
pandoc 51 | 52 | - name: 🔽 Install the latest version of rye 53 | uses: eifinger/setup-rye@v4 54 | with: 55 | enable-cache: true 56 | 57 | - name: 📌 Pin Python version 58 | run: rye pin ${{ matrix.python-version }} 59 | 60 | - name: 🔽 Download and Install NATS Server 61 | run: | 62 | curl -L https://github.com/nats-io/nats-server/releases/download/v2.10.22/nats-server-v2.10.22-linux-amd64.zip -o nats-server.zip 63 | unzip nats-server.zip -d nats-server && sudo cp nats-server/nats-server-v2.10.22-linux-amd64/nats-server /usr/bin 64 | 65 | - name: 🛠️ Set up NATS arguments 66 | run: | 67 | nohup nats-server \ 68 | --addr 0.0.0.0 \ 69 | --port 4222 \ 70 | --auth "$NATS_TOKEN" > nats.log 2>&1 & 71 | 72 | - name: 🔍 Verify NATS Server is Running 73 | run: | 74 | sleep 1 # Give the server some time to start 75 | if nc -zv localhost 4222; then 76 | echo "✅ NATS Server is running on port 4222." 77 | else 78 | echo "❌ Failed to start NATS Server." 79 | cat nats.log 80 | exit 1 81 | fi 82 | 83 | - name: 🔨 Sync dependencies 84 | run: | 85 | UV_INDEX_STRATEGY=unsafe-first-match rye sync --no-lock 86 | 87 | - name: 🚀 Run tests 88 | run: | 89 | rye test -p megaparse-sdk 90 | -------------------------------------------------------------------------------- /.github/workflows/build-and-deploy.yml: -------------------------------------------------------------------------------- 1 | name: Build Docker image and push ECR 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*" 7 | branches: [main] 8 | 9 | env: 10 | AWS_REGION: eu-west-1 11 | ECR_REPOSITORY: quivrhq/megaparse 12 | ECS_CLUSTER: megaparse 13 | ECS_TASK_DEFINITION: .aws/task_definition.json 14 | CONTAINER_NAME: megaparse 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | deploy: 21 | name: build docker 22 | runs-on: ubuntu-latest 23 | environment: production 24 | outputs: 25 | imageoutput: ${{ steps.build-image.outputs.imageoutput }} 26 | 27 | steps: 28 | - name: Checkout 29 | uses: actions/checkout@v3 30 | 31 | - name: 
Configure AWS credentials 32 | uses: aws-actions/configure-aws-credentials@v4 33 | with: 34 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 35 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 36 | aws-region: us-east-1 37 | 38 | - name: Login to Amazon ECR 39 | id: login-ecr 40 | uses: aws-actions/amazon-ecr-login@v1 41 | with: 42 | registry-type: public 43 | 44 | - name: Build, tag, and push image to Amazon ECR 45 | id: build-image 46 | env: 47 | ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} 48 | IMAGE_TAG: ${{ github.sha }} 49 | run: | 50 | # Build a docker container and push it to ECR 51 | docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG . 52 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG 53 | 54 | # Tag the image as 'latest' and push 55 | docker tag $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG $ECR_REGISTRY/$ECR_REPOSITORY:latest 56 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:latest 57 | 58 | echo "imageoutput=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> $GITHUB_OUTPUT 59 | -------------------------------------------------------------------------------- /.github/workflows/build-gpu.yml: -------------------------------------------------------------------------------- 1 | name: Build docker GPU and push ECR 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*" 7 | branches: [main] 8 | 9 | env: 10 | AWS_REGION: eu-west-1 11 | ECR_REPOSITORY: quivrhq/megaparse-gpu 12 | ECS_CLUSTER: megaparse 13 | ECS_TASK_DEFINITION: .aws/task_definition.json 14 | CONTAINER_NAME: megaparse 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | deploy: 21 | name: Build docker-gpu 22 | runs-on: 23 | group: big-boy-gpu 24 | environment: production 25 | outputs: 26 | imageoutput: ${{ steps.build-image.outputs.imageoutput }} 27 | 28 | steps: 29 | - name: Checkout 30 | uses: actions/checkout@v3 31 | 32 | - name: Configure AWS credentials 33 | uses: aws-actions/configure-aws-credentials@v4 34 | with: 35 | aws-access-key-id: ${{ 
secrets.AWS_ACCESS_KEY_ID }} 36 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 37 | aws-region: us-east-1 38 | 39 | - name: Login to Amazon ECR 40 | id: login-ecr 41 | uses: aws-actions/amazon-ecr-login@v1 42 | with: 43 | registry-type: public 44 | 45 | - name: Build, tag, and push image to Amazon ECR 46 | id: build-image 47 | env: 48 | ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} 49 | IMAGE_TAG: ${{ github.sha }} 50 | run: | 51 | # Build a docker container and push it to ECR 52 | docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG -f Dockerfile.gpu . 53 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG 54 | 55 | # Tag the image as 'latest' and push 56 | docker tag $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG $ECR_REGISTRY/$ECR_REPOSITORY:latest 57 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:latest 58 | 59 | echo "imageoutput=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> $GITHUB_OUTPUT 60 | -------------------------------------------------------------------------------- /.github/workflows/release-please.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | 6 | permissions: 7 | contents: write 8 | pull-requests: write 9 | 10 | name: release-please 11 | 12 | jobs: 13 | release-please: 14 | runs-on: ubuntu-latest 15 | outputs: 16 | release_created: ${{ steps.release.outputs['libs/megaparse--release_created'] }} 17 | release_created_sdk: ${{ steps.release.outputs['libs/megaparse_sdk--release_created'] }} 18 | steps: 19 | - name: Checkout repository 20 | uses: actions/checkout@v3 21 | with: 22 | fetch-depth: 0 # Fetch all history for tags and releases 23 | 24 | - name: Setup Python 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: "3.11" 28 | 29 | - name: Run release-please 30 | id: release 31 | uses: google-github-actions/release-please-action@v4 32 | with: 33 | token: ${{ secrets.RELEASE_PLEASE_TOKEN }} 34 | 35 | deploy-megaparse: 36 | if: 
needs.release-please.outputs.release_created == 'true' 37 | needs: release-please 38 | runs-on: ubuntu-latest 39 | steps: 40 | - uses: actions/checkout@v4 41 | - name: Install Rye 42 | uses: eifinger/setup-rye@v2 43 | with: 44 | enable-cache: true 45 | - name: Rye Sync 46 | run: rye sync --no-lock 47 | - name: Rye Build 48 | run: cd libs/megaparse && rye build 49 | - name: Rye Publish 50 | run: cd libs/megaparse && rye publish --token ${{ secrets.PYPI_API_TOKEN }} --yes 51 | 52 | deploy-sdk: 53 | if: needs.release-please.outputs.release_created_sdk == 'true' 54 | needs: release-please 55 | runs-on: ubuntu-latest 56 | steps: 57 | - uses: actions/checkout@v4 58 | - name: Install Rye 59 | uses: eifinger/setup-rye@v2 60 | with: 61 | enable-cache: true 62 | - name: Rye Sync 63 | run: cd libs/megaparse_sdk && rye sync --no-lock 64 | - name: Rye Build 65 | run: cd libs/megaparse_sdk && rye build 66 | - name: Rye Publish 67 | run: cd libs/megaparse_sdk && rye publish --token ${{ secrets.PYPI_API_TOKEN }} --yes 68 | -------------------------------------------------------------------------------- /.github/workflows/test-build-docker.yml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: 3 | branches: 4 | - main 5 | 6 | name: Test build docker 7 | jobs: 8 | build-docker: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | matrix: 12 | dockerfile: [Dockerfile, Dockerfile.gpu] 13 | steps: 14 | - name: Checkout repository 15 | uses: actions/checkout@v3 16 | 17 | - name: Set up QEMU 18 | uses: docker/setup-qemu-action@v3 19 | with: 20 | platforms: all 21 | 22 | - name: Set up Docker Buildx 23 | uses: docker/setup-buildx-action@v3 24 | 25 | - name: Build Docker image with caching 26 | uses: docker/build-push-action@v4 27 | with: 28 | context: . 
29 | file: ${{ matrix.dockerfile }} 30 | push: false 31 | tags: quivrhq/megaparse:${{ matrix.dockerfile }} 32 | cache-from: type=gha 33 | cache-to: type=gha,mode=max 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /output 2 | /input 3 | .env 4 | __pycache__/ 5 | dist/** 6 | megaparse.egg-info/ 7 | *.pyc 8 | build/* 9 | ENV 10 | venv 11 | */evaluations/* 12 | */cdp/* 13 | *.pkl 14 | 15 | !megaparse/tests/output_tests/MegaFake_report.md 16 | *.DS_Store 17 | .tool-versions 18 | megaparse/sdk/examples/only_pdfs/* 19 | 20 | **/profile/ 21 | **/prof/ 22 | .ropeproject/ 23 | benchmark/hi_res/* 24 | benchmark/auto/* 25 | 26 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.6.0 4 | hooks: 5 | - id: check-added-large-files 6 | args: ["--maxkb=5000"] 7 | - id: check-toml 8 | - id: check-yaml 9 | - id: end-of-file-fixer 10 | - id: trailing-whitespace 11 | - id: check-merge-conflict 12 | - id: detect-private-key 13 | - id: check-case-conflict 14 | - repo: https://github.com/pre-commit/pre-commit 15 | rev: v3.6.2 16 | hooks: 17 | - id: validate_manifest 18 | - repo: https://github.com/astral-sh/ruff-pre-commit 19 | # Ruff version. 20 | rev: v0.5.1 21 | hooks: 22 | # Run the linter. 23 | - id: ruff 24 | args: [--fix] 25 | additional_dependencies: [] 26 | # Run the formatter. 
27 | - id: ruff-format 28 | additional_dependencies: [] 29 | - repo: https://github.com/pre-commit/mirrors-mypy 30 | rev: v1.10.1 31 | hooks: 32 | - id: mypy 33 | name: mypy 34 | additional_dependencies: ["types-aiofiles"] 35 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11.9 2 | -------------------------------------------------------------------------------- /.release-please-manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "libs/megaparse": "0.0.55", 3 | "libs/megaparse_sdk": "0.1.12" 4 | } 5 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "dbaeumer.vscode-eslint", 4 | "charliermarsh.ruff", 5 | "knisterpeter.vscode-github", 6 | "github.vscode-pull-request-github", 7 | "ms-python.python", 8 | "ms-python.vscode-pylance", 9 | "ms-python.debugpy" 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Python: Remote Attach", 6 | "type": "python", 7 | "request": "attach", 8 | "connect": { 9 | "host": "localhost", 10 | "port": 5678 11 | }, 12 | "pathMappings": [ 13 | { 14 | "localRoot": "${workspaceFolder}/backend", 15 | "remoteRoot": "." 
16 | } 17 | ], 18 | "justMyCode": true 19 | }, 20 | { 21 | "name": "Python: Debug Test Script", 22 | "type": "python", 23 | "request": "launch", 24 | "program": "${workspaceFolder}/backend/test_process_file_and_notify.py", 25 | "console": "integratedTerminal", 26 | "justMyCode": false 27 | }, 28 | { 29 | "name": "Python: Debug", 30 | "type": "debugpy", 31 | "request": "launch", 32 | "program": "${file}", 33 | "console": "integratedTerminal", 34 | "justMyCode": false, 35 | "env": { 36 | "PYTHONPATH": "${workspaceFolder}/backend:${env:PYTHONPATH}" 37 | }, 38 | "envFile": "${workspaceFolder}/.env" 39 | } 40 | ] 41 | } 42 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.formatOnSave": true, 3 | "editor.formatOnSaveMode": "file", 4 | "files.exclude": { 5 | "**/__pycache__": true, 6 | "**/.benchmarks/": true, 7 | "**/.cache/": true, 8 | "**/.pytest_cache/": true, 9 | "**/.next/": true, 10 | "**/build/": true, 11 | "**/.docusaurus/": true, 12 | "**/node_modules/": true 13 | }, 14 | "[python]": { 15 | "editor.defaultFormatter": "charliermarsh.ruff", 16 | "editor.formatOnSave": true, 17 | "editor.codeActionsOnSave": { 18 | "source.organizeImports": "explicit", 19 | "source.fixAll": "explicit" 20 | } 21 | }, 22 | "python.testing.unittestEnabled": false, 23 | "python.testing.pytestEnabled": true, 24 | "python.testing.autoTestDiscoverOnSaveEnabled": true, 25 | "python.analysis.autoImportCompletions": true, 26 | "python.analysis.typeCheckingMode": "basic", 27 | "python.analysis.diagnosticSeverityOverrides": { 28 | "reportMissingImports": "error", 29 | "reportUnusedImport": "warning", 30 | "reportGeneralTypeIssues": "warning" 31 | }, 32 | "makefile.configureOnOpen": false 33 | } 34 | -------------------------------------------------------------------------------- /Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM python:3.11.10-slim-bullseye 2 | 3 | WORKDIR /app 4 | 5 | # Install runtime dependencies 6 | RUN apt-get update && apt-get upgrade -y && apt-get install -y \ 7 | libgeos-dev \ 8 | libcurl4-openssl-dev \ 9 | libssl-dev \ 10 | binutils \ 11 | curl \ 12 | git \ 13 | autoconf \ 14 | automake \ 15 | build-essential \ 16 | libtool \ 17 | python3-dev \ 18 | build-essential \ 19 | wget \ 20 | gcc \ 21 | # Additional dependencies for document handling 22 | libmagic-dev \ 23 | poppler-utils \ 24 | tesseract-ocr \ 25 | libreoffice \ 26 | libpq-dev \ 27 | pandoc && \ 28 | rm -rf /var/lib/apt/lists/* && apt-get clean 29 | 30 | COPY requirements.lock pyproject.toml README.md ./ 31 | COPY libs/megaparse/pyproject.toml libs/megaparse/README.md libs/megaparse/ 32 | COPY libs/megaparse_sdk/pyproject.toml libs/megaparse_sdk/README.md libs/megaparse_sdk/ 33 | 34 | RUN pip install uv 35 | RUN uv pip install --no-cache --system -r requirements.lock 36 | 37 | RUN playwright install --with-deps 38 | RUN python3 -m nltk.downloader all 39 | 40 | COPY . . 
41 | 42 | RUN uv pip install --no-cache --system /app/libs/megaparse /app/libs/megaparse_sdk 43 | 44 | EXPOSE 8000 45 | CMD ["uvicorn", "megaparse.api.app:app", "--host", "0.0.0.0", "--port", "8000"] 46 | -------------------------------------------------------------------------------- /Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu20.04 2 | 3 | WORKDIR /app 4 | 5 | ENV UV_COMPILE_BYTECODE=1 6 | ENV UV_NO_CACHE=1 7 | ENV DEBIAN_FRONTEND=noninteractive 8 | 9 | # Install runtime dependencies 10 | RUN apt-get update && apt-get install -y software-properties-common && \ 11 | add-apt-repository ppa:deadsnakes/ppa && \ 12 | apt-get update && apt-get install -y \ 13 | python3.11 \ 14 | python3.11-dev \ 15 | libgeos-dev \ 16 | libcurl4-openssl-dev \ 17 | libssl-dev \ 18 | binutils \ 19 | curl \ 20 | git \ 21 | autoconf \ 22 | automake \ 23 | libtool \ 24 | python3-pip \ 25 | build-essential \ 26 | wget \ 27 | gcc \ 28 | # Additional dependencies for document handling 29 | libmagic-dev \ 30 | poppler-utils \ 31 | tesseract-ocr \ 32 | libreoffice \ 33 | libpq-dev \ 34 | pandoc && \ 35 | rm -rf /var/lib/apt/lists/* && apt-get clean 36 | 37 | RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \ 38 | update-alternatives --set python3 /usr/bin/python3.11 39 | 40 | COPY requirements.lock pyproject.toml README.md ./ 41 | COPY libs/megaparse/pyproject.toml libs/megaparse/README.md libs/megaparse/ 42 | COPY libs/megaparse_sdk/pyproject.toml libs/megaparse_sdk/README.md libs/megaparse_sdk/ 43 | 44 | RUN curl -LsSf https://astral.sh/uv/install.sh | sh 45 | ENV PATH="/root/.local/bin:$PATH" 46 | RUN uv pip install --no-cache --system -r requirements.lock 47 | 48 | RUN playwright install --with-deps 49 | RUN python3 -m nltk.downloader all 50 | 51 | # FIXME: causes runtime link issues with onnxruntime_pybind_state.cc:507 unstructured 52 | # RUN python3 -c 
"from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \ 53 | # RUN python3 -c "import nltk; nltk.download('punkt_tab'); nltk.download('averaged_perceptron_tagger_eng')" 54 | 55 | COPY . . 56 | 57 | RUN uv pip install --no-cache --system /app/libs/megaparse /app/libs/megaparse_sdk 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_TARGET=help 2 | 3 | ## help: Display list of commands 4 | .PHONY: help 5 | help: 6 | @echo "Available commands:" 7 | @sed -n 's|^##||p' $(MAKEFILE_LIST) | column -t ':' | sed -e 's|^| |' 8 | 9 | ## dev: Start development environment 10 | .PHONY: dev 11 | dev: 12 | DOCKER_BUILDKIT=1 docker compose -f docker-compose.dev.yml up --build 13 | 14 | ## dev-build: Build development environment without cache 15 | .PHONY: dev-build 16 | dev-build: 17 | DOCKER_BUILDKIT=1 docker compose -f docker-compose.dev.yml build --no-cache 18 | DOCKER_BUILDKIT=1 docker compose -f docker-compose.dev.yml up 19 | 20 | ## prod: Build and start production environment 21 | .PHONY: prod 22 | prod: 23 | docker compose -f docker-compose.yml up --build 24 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = 
"https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | 8 | [dev-packages] 9 | 10 | [requires] 11 | python_version = "3.11" 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MegaParse - Your Parser for every type of documents 2 | 3 |
4 | Quivr-logo 5 |
6 | 7 | MegaParse is a powerful and versatile parser that can handle various types of documents with ease. Whether you're dealing with text, PDFs, Powerpoint presentations, Word documents MegaParse has got you covered. Focus on having no information loss during parsing. 8 | 9 | ## Key Features 🎯 10 | 11 | - **Versatile Parser**: MegaParse is a powerful and versatile parser that can handle various types of documents with ease. 12 | - **No Information Loss**: Focus on having no information loss during parsing. 13 | - **Fast and Efficient**: Designed with speed and efficiency at its core. 14 | - **Wide File Compatibility**: Supports Text, PDF, Powerpoint presentations, Excel, CSV, Word documents. 15 | - **Open Source**: Freedom is beautiful, and so is MegaParse. Open source and free to use. 16 | 17 | ## Support 18 | 19 | - Files: ✅ PDF ✅ Powerpoint ✅ Word 20 | - Content: ✅ Tables ✅ TOC ✅ Headers ✅ Footers ✅ Images 21 | 22 | ### Example 23 | 24 | https://github.com/QuivrHQ/MegaParse/assets/19614572/1b4cdb73-8dc2-44ef-b8b4-a7509bc8d4f3 25 | 26 | ## Installation 27 | 28 | required python version >= 3.11 29 | 30 | ```bash 31 | pip install megaparse 32 | ``` 33 | 34 | ## Usage 35 | 36 | 1. Add your OpenAI or Anthropic API key to the .env file 37 | 38 | 2. Install poppler on your computer (images and PDFs) 39 | 40 | 3. Install tesseract on your computer (images and PDFs) 41 | 42 | 4. 
If you have a mac, you also need to install libmagic ```brew install libmagic``` 43 | 44 | Use MegaParse as it is : 45 | ```python 46 | from megaparse import MegaParse 47 | from langchain_openai import ChatOpenAI 48 | 49 | megaparse = MegaParse() 50 | response = megaparse.load("./test.pdf") 51 | print(response) 52 | ``` 53 | 54 | ### Use MegaParse Vision 55 | 56 | ```python 57 | from megaparse.parser.megaparse_vision import MegaParseVision 58 | 59 | model = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY")) # type: ignore 60 | parser = MegaParseVision(model=model) 61 | response = parser.convert("./test.pdf") 62 | print(response) 63 | 64 | ``` 65 | **Note**: The model supported by MegaParse Vision are the multimodal ones such as claude 3.5, claude 4, gpt-4o and gpt-4. 66 | 67 | ## Use as an API 68 | There is a MakeFile for you, simply use : 69 | ```make dev``` 70 | at the root of the project and you are good to go. 71 | 72 | See localhost:8000/docs for more info on the different endpoints ! 73 | 74 | ## BenchMark 75 | 76 | 77 | | Parser | similarity_ratio | 78 | | ----------------------------- | ---------------- | 79 | | megaparse_vision | 0.87 | 80 | | unstructured_with_check_table | 0.77 | 81 | | unstructured | 0.59 | 82 | | llama_parser | 0.33 | 83 | 84 | 85 | _Higher the better_ 86 | 87 | Note: Want to evaluate and compare your Megaparse module with ours ? Please add your config in ```evaluations/script.py``` and then run ```python evaluations/script.py```. If it is better, do a PR, I mean, let's go higher together . 
88 | 89 | ## In Construction 🚧 90 | - Improve table checker 91 | - Create Checkers to add **modular postprocessing** ⚙️ 92 | - Add Structured output, **let's get computer talking** 🤖 93 | 94 | 95 | 96 | ## Star History 97 | 98 | [![Star History Chart](https://api.star-history.com/svg?repos=QuivrHQ/MegaParse&type=Date)](https://star-history.com/#QuivrHQ/MegaParse&Date) 99 | -------------------------------------------------------------------------------- /benchmark/process_single_doc.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import time 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | from megaparse import MegaParse 7 | 8 | N_TRY = 1 9 | 10 | 11 | async def process_file(megaparse: MegaParse, file_path: str | Path): 12 | try: 13 | t0 = time.perf_counter() 14 | _ = await megaparse.aload( 15 | file_path=file_path, 16 | ) 17 | total = time.perf_counter() - t0 18 | return total 19 | except Exception as e: 20 | print(f"Exception occured: {e}") 21 | return None 22 | 23 | 24 | async def test_process_file(file: str | Path): 25 | # parser = UnstructuredParser(strategy=StrategyEnum.HI_RES) 26 | megaparse = MegaParse() 27 | task = [] 28 | for _ in range(N_TRY): 29 | task.append(process_file(megaparse, file)) 30 | list_process_time = await asyncio.gather(*task) 31 | 32 | n_errors = sum([t is None for t in list_process_time]) 33 | list_process_time = [t for t in list_process_time if t is not None] 34 | 35 | np_list_process_time = np.array(list_process_time) 36 | print(f"All errors : {n_errors}") 37 | print(f"Average time taken: {np_list_process_time.mean()}") 38 | print(f"Median time taken: {np.median(list_process_time)}") 39 | print(f"Standard deviation of time taken: {np.std(list_process_time)}") 40 | print(f"Max time taken: {np.max(list_process_time)}") 41 | print(f"Min time taken: {np.min(list_process_time)}") 42 | 43 | 44 | if __name__ == "__main__": 45 | folder_path = 
"/Users/amine/data/quivr/parsing/scanned/machine.pdf" 46 | asyncio.run(test_process_file(folder_path)) 47 | -------------------------------------------------------------------------------- /benchmark/test_quality_sim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import difflib 3 | from pathlib import Path 4 | 5 | auto_dir = Path("benchmark/auto") 6 | hi_res_dir = Path("benchmark/hi_res") 7 | 8 | 9 | def jaccard_similarity(str1, str2): 10 | if len(str1) == 0 and len(str2) == 0: 11 | return 1 12 | # Tokenize the strings into sets of words 13 | words1 = set(str1.split()) 14 | words2 = set(str2.split()) 15 | 16 | # Find intersection and union of the word sets 17 | intersection = words1.intersection(words2) 18 | union = words1.union(words2) 19 | 20 | # Compute Jaccard similarity 21 | return len(intersection) / len(union) if len(union) != 0 else 0 22 | 23 | 24 | def compare_files(file_name): 25 | file_path_auto = auto_dir / f"{file_name}.md" 26 | file_path_hi_res = hi_res_dir / f"{file_name}.md" 27 | 28 | with open(file_path_auto, "r") as f: 29 | auto_content = f.read() 30 | 31 | with open(file_path_hi_res, "r") as f: 32 | hi_res_content = f.read() 33 | 34 | if len(auto_content) == 0 and len(hi_res_content) == 0: 35 | return 1 36 | 37 | similarity = difflib.SequenceMatcher(None, auto_content, hi_res_content).ratio() 38 | # similarity = jaccard_similarity(auto_content, hi_res_content) 39 | 40 | return similarity 41 | 42 | 43 | def main(): 44 | files = os.listdir(hi_res_dir) 45 | print(f"Comparing {len(files)} files...") 46 | similarity_dict = {} 47 | for file in files: 48 | file_name = Path(file).stem 49 | similarity = compare_files(file_name) 50 | similarity_dict[file_name] = similarity 51 | 52 | avg_similarity = sum(similarity_dict.values()) / len(similarity_dict) 53 | print(f"\nAverage similarity: {avg_similarity}\n") 54 | 55 | pass_rate = sum( 56 | [similarity > 0.9 for similarity in similarity_dict.values()] 57 
| ) / len(similarity_dict) 58 | 59 | print(f"Pass rate: {pass_rate}\n") 60 | 61 | print("Under 0.9 similarity documents:") 62 | print("-------------------------------") 63 | for file_name, similarity in similarity_dict.items(): 64 | if similarity < 0.9: 65 | print(f"{file_name}: {similarity}") 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /docker-compose.dev.yml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | services: 4 | megaparse: 5 | build: 6 | context: . 7 | dockerfile: Dockerfile 8 | cache_from: 9 | - megaparse:latest 10 | args: 11 | - DEV_MODE=true 12 | image: megaparse:latest 13 | extra_hosts: 14 | - "host.docker.internal:host-gateway" 15 | container_name: megaparse 16 | volumes: 17 | - ./:/app/ 18 | command: > 19 | /bin/bash -c "python -m uvicorn megaparse.api.app:app --host 0.0.0.0 --log-level info --reload --port 8000" 20 | restart: always 21 | ports: 22 | - 8000:8000 23 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | services: 4 | megaparse: 5 | image: megaparse:latest 6 | pull_policy: if_not_present 7 | container_name: megaparse 8 | extra_hosts: 9 | - "host.docker.internal:host-gateway" 10 | healthcheck: 11 | test: [ "CMD", "curl", "http://localhost:5050/healthz" ] 12 | command: > 13 | /bin/bash -c "python -m uvicorn megaparse.api.app:app --host 0.0.0.0 --log-level info --reload --port 8000 --loop uvloop" 14 | restart: always 15 | ports: 16 | - 8000:8000 17 | -------------------------------------------------------------------------------- /docs/archive.txt: -------------------------------------------------------------------------------- 1 | ### (Optional) Use LlamaParse for Improved Results 2 | 3 | 1. 
Create an account on [Llama Cloud](https://cloud.llamaindex.ai/) and get your API key. 4 | 5 | 2. Change the parser to LlamaParser 6 | 7 | ```python 8 | from megaparse import MegaParse 9 | from langchain_openai import ChatOpenAI 10 | from megaparse.parser.llama_parser import LlamaParser 11 | 12 | parser = LlamaParser(api_key = os.getenv("LLAMA_CLOUD_API_KEY")) 13 | megaparse = MegaParse(parser) 14 | response = megaparse.load("./test.pdf") 15 | print(response) 16 | megaparse.save("./test.md") #saves the last processed doc in md format 17 | ``` -------------------------------------------------------------------------------- /evaluations/script.py: -------------------------------------------------------------------------------- 1 | import difflib 2 | import os 3 | 4 | from langchain_openai import ChatOpenAI 5 | from megaparse.megaparse import MegaParse 6 | from megaparse.parser.llama import LlamaParser 7 | from megaparse.parser.megaparse_vision import MegaParseVision 8 | from megaparse.parser.unstructured_parser import UnstructuredParser 9 | from megaparse_sdk.schema.parser_config import StrategyEnum 10 | 11 | if __name__ == "__main__": 12 | print("---Launching evaluations script---") 13 | model = ChatOpenAI(model="gpt-4o", api_key=str(os.getenv("OPENAI_API_KEY"))) # type: ignore 14 | parser_dict = { 15 | "unstructured": UnstructuredParser(strategy=StrategyEnum.AUTO, model=None), 16 | "unstructured_with_check_table": UnstructuredParser( 17 | strategy=StrategyEnum.AUTO, 18 | model=model, 19 | ), 20 | "llama_parser": LlamaParser(api_key=str(os.getenv("LLAMA_CLOUD_API_KEY"))), 21 | "megaparse_vision": MegaParseVision(model=model), 22 | } 23 | 24 | base_pdf_path = "tests/data/MegaFake_report.pdf" 25 | base_md_path = "tests/data/grt_example/MegaFake_report.md" 26 | with open(base_md_path, "r", encoding="utf-8") as f: 27 | base_md = f.read() 28 | 29 | score_dict = {} 30 | 31 | for method, parser in parser_dict.items(): 32 | print(f"Method: {method}") 33 | megaparse = 
MegaParse() 34 | result = megaparse.load(file_path=base_pdf_path) 35 | score_dict[method] = difflib.SequenceMatcher(None, base_md, result).ratio() 36 | print(f"Score for method {method}: {score_dict[method]}") 37 | 38 | # Sort the results 39 | sorted_score = sorted(score_dict.items(), key=lambda x: x[1], reverse=True) 40 | 41 | # Generate a table with the results 42 | benchmark_results = "| Parser | similarity_ratio |\n|---|---|\n" 43 | for parser, score in sorted_score: 44 | benchmark_results += f"| {parser} | {score:.2f} |\n" 45 | 46 | print(benchmark_results) 47 | 48 | # Update README.md file 49 | with open("README.md", "r") as readme_file: 50 | readme_content = readme_file.read() 51 | 52 | start_marker = "" 53 | end_marker = "" 54 | start_index = readme_content.find(start_marker) + len(start_marker) 55 | end_index = readme_content.find(end_marker) 56 | 57 | updated_readme_content = ( 58 | readme_content[:start_index] 59 | + "\n" 60 | + benchmark_results 61 | + readme_content[end_index:] 62 | ) 63 | 64 | with open("README.md", "w") as readme_file: 65 | readme_file.write(updated_readme_content) 66 | -------------------------------------------------------------------------------- /images/tables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/images/tables.png -------------------------------------------------------------------------------- /libs/megaparse/.python-version: -------------------------------------------------------------------------------- 1 | 3.11.9 -------------------------------------------------------------------------------- /libs/megaparse/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [0.0.55](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.54...megaparse-v0.0.55) (2025-02-14) 4 | 5 | 6 | ### Features 7 | 8 | * remove tensorrt 
([#230](https://github.com/QuivrHQ/MegaParse/issues/230)) ([8b8abbc](https://github.com/QuivrHQ/MegaParse/commit/8b8abbc6a2a1b33d4e921d55d2519b773ec062c8)) 9 | 10 | ## [0.0.54](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.53...megaparse-v0.0.54) (2025-02-11) 11 | 12 | 13 | ### Features 14 | 15 | * add_layout_detection ([#220](https://github.com/QuivrHQ/MegaParse/issues/220)) ([2d2d0b4](https://github.com/QuivrHQ/MegaParse/commit/2d2d0b42bba4c883db423568e932eda42edd60d7)) 16 | 17 | ## [0.0.53](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.52...megaparse-v0.0.53) (2025-01-16) 18 | 19 | 20 | ### Features 21 | 22 | * modular parser and formatter v0 ([#175](https://github.com/QuivrHQ/MegaParse/issues/175)) ([1f4dcf8](https://github.com/QuivrHQ/MegaParse/commit/1f4dcf88a5901c5a2682cb79284a0dbb08034cb2)) 23 | * Text detection in auto strategy ([#209](https://github.com/QuivrHQ/MegaParse/issues/209)) ([03c7ada](https://github.com/QuivrHQ/MegaParse/commit/03c7ada1dc245e13ef41ffd6fa3a8ed869269d37)) 24 | * type strategy output ([#216](https://github.com/QuivrHQ/MegaParse/issues/216)) ([deb8765](https://github.com/QuivrHQ/MegaParse/commit/deb8765a4df8917a4857f51a02025243192d5cf8)) 25 | 26 | 27 | ### Bug Fixes 28 | 29 | * Add EngineConfig & StrategyHandler ([#211](https://github.com/QuivrHQ/MegaParse/issues/211)) ([2e1c6dd](https://github.com/QuivrHQ/MegaParse/commit/2e1c6ddd676227d1cbc4cff9771b20595259ba38)) 30 | * add parse tests for every supported extensions ([#198](https://github.com/QuivrHQ/MegaParse/issues/198)) ([9dff0de](https://github.com/QuivrHQ/MegaParse/commit/9dff0de0c1de848151fe9a6519b658f0924c1228)) 31 | * logging error ([#218](https://github.com/QuivrHQ/MegaParse/issues/218)) ([a2170d7](https://github.com/QuivrHQ/MegaParse/commit/a2170d7c711a5d7a0531f03aa9576937ddd6576e)) 32 | * megaparse.load & add tests ([#202](https://github.com/QuivrHQ/MegaParse/issues/202)) 
([13c2677](https://github.com/QuivrHQ/MegaParse/commit/13c2677bdadb4ba985a1abf9bafeb70548ab59f9)) 33 | * Strategy heuristic test & fix ([#203](https://github.com/QuivrHQ/MegaParse/issues/203)) ([7b7fb40](https://github.com/QuivrHQ/MegaParse/commit/7b7fb40cae4ed380a5f0ca0035a7bd2bcc9147c3)) 34 | * sync convert to parsers ([#186](https://github.com/QuivrHQ/MegaParse/issues/186)) ([fbb7d36](https://github.com/QuivrHQ/MegaParse/commit/fbb7d365fbaf710a687fdc6becacd6d301c09707)) 35 | 36 | ## [0.0.52](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.51...megaparse-v0.0.52) (2024-12-16) 37 | 38 | 39 | ### Bug Fixes 40 | 41 | * hatchling version ([#193](https://github.com/QuivrHQ/MegaParse/issues/193)) ([f6070a5](https://github.com/QuivrHQ/MegaParse/commit/f6070a5483a20eeb83751a2dcfc01b7f0fb14473)) 42 | 43 | ## [0.0.51](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.50...megaparse-v0.0.51) (2024-12-16) 44 | 45 | 46 | ### Features 47 | 48 | * updating langchain version ([#187](https://github.com/QuivrHQ/MegaParse/issues/187)) ([0f1f597](https://github.com/QuivrHQ/MegaParse/commit/0f1f5977df147e6b8c65d55445ccd86ef6f1a862)) 49 | 50 | ## [0.0.50](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.49...megaparse-v0.0.50) (2024-12-13) 51 | 52 | 53 | ### Features 54 | 55 | * small fixes ([#181](https://github.com/QuivrHQ/MegaParse/issues/181)) ([004afe2](https://github.com/QuivrHQ/MegaParse/commit/004afe2f170570075bbebcd32dec5d15ddba4609)) 56 | 57 | ## [0.0.49](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.48...megaparse-v0.0.49) (2024-12-12) 58 | 59 | 60 | ### Features 61 | 62 | * custom auto ([#131](https://github.com/QuivrHQ/MegaParse/issues/131)) ([3cb5be4](https://github.com/QuivrHQ/MegaParse/commit/3cb5be4a8c8eeb6dd6e9b87d7bbca24491db4c29)) 63 | * faster ocr ([#180](https://github.com/QuivrHQ/MegaParse/issues/180)) ([5661cb2](https://github.com/QuivrHQ/MegaParse/commit/5661cb2d52d959cbca0f41339791129cd35d4036)) 64 | 65 | ## 
[0.0.48](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.47...megaparse-v0.0.48) (2024-12-03) 66 | 67 | 68 | ### Features 69 | 70 | * Update imports and parsers in README.md ([#156](https://github.com/QuivrHQ/MegaParse/issues/156)) ([33e0303](https://github.com/QuivrHQ/MegaParse/commit/33e0303821691c4b1fc821e6b33b874bd332d430)) 71 | 72 | ## [0.0.47](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.46...megaparse-v0.0.47) (2024-11-21) 73 | 74 | 75 | ### Features 76 | 77 | * refacto megaparse for service ([#132](https://github.com/QuivrHQ/MegaParse/issues/132)) ([ab9ad7f](https://github.com/QuivrHQ/MegaParse/commit/ab9ad7fb7db580a04a998d144dd2ba3407068334)) 78 | * release plz ([#134](https://github.com/QuivrHQ/MegaParse/issues/134)) ([d8a221e](https://github.com/QuivrHQ/MegaParse/commit/d8a221e23f6e15e969c1328f183da3582d0d7925)) 79 | 80 | ## [0.0.22](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.21...megaparse-v0.0.22) (2024-07-24) 81 | 82 | 83 | ### Features 84 | 85 | * Add instructions for installing poppler and tesseract ([#10](https://github.com/QuivrHQ/MegaParse/issues/10)) ([3399552](https://github.com/QuivrHQ/MegaParse/commit/3399552bc8be705f6d34306743388a96d099eebc)) 86 | * Add MegaParse class to __init__.py ([84c0d64](https://github.com/QuivrHQ/MegaParse/commit/84c0d648ef1ddf048ec911210d89be155443dc72)) 87 | * Add support for Unstructured Parser, improve Table and Image Parsing, and add TOC and Hyperlinks for Docx ([#9](https://github.com/QuivrHQ/MegaParse/issues/9)) ([4934776](https://github.com/QuivrHQ/MegaParse/commit/493477672cef9fe22b0ab56ced1d5572104e1914)) 88 | * base loader ([#65](https://github.com/QuivrHQ/MegaParse/issues/65)) ([eb8149f](https://github.com/QuivrHQ/MegaParse/commit/eb8149f05ec2793f59fd87109a1aba8095f6f1d0)) 89 | * base loader class ([#64](https://github.com/QuivrHQ/MegaParse/issues/64)) ([801a026](https://github.com/QuivrHQ/MegaParse/commit/801a026e4b3411f8ac85171a6928e3d17c027648)) 90 | * Update 
benchmark results in README.md ([#15](https://github.com/QuivrHQ/MegaParse/issues/15)) ([1dfcb4c](https://github.com/QuivrHQ/MegaParse/commit/1dfcb4ce19467f7fb8137e10e5f5fbf35e563df0)) 91 | 92 | 93 | ### Bug Fixes 94 | 95 | * add __init__.py ([a5b8de9](https://github.com/QuivrHQ/MegaParse/commit/a5b8de9e1e01ef681ac2ef59a6e111ae7bd6cf70)) 96 | * change name ([6b36437](https://github.com/QuivrHQ/MegaParse/commit/6b36437787f048d36d69c3b06c2d59f7dc7a741f)) 97 | * PR Comments ([a0ab0ba](https://github.com/QuivrHQ/MegaParse/commit/a0ab0baa5dd9aae644baef55348f1af28a6776a7)) 98 | * remove nest asycio ([22195a2](https://github.com/QuivrHQ/MegaParse/commit/22195a27e9dc3583bf1fbde2a95e9fbecc8d96a4)) 99 | * use aload_data ([e5c73fe](https://github.com/QuivrHQ/MegaParse/commit/e5c73fefcbf09bb12810adc6d4412f7742c42089)) 100 | 101 | ## [0.0.21](https://github.com/QuivrHQ/MegaParse/compare/v0.0.20...v0.0.21) (2024-07-24) 102 | 103 | 104 | ### Features 105 | 106 | * base loader ([#65](https://github.com/QuivrHQ/MegaParse/issues/65)) ([eb8149f](https://github.com/QuivrHQ/MegaParse/commit/eb8149f05ec2793f59fd87109a1aba8095f6f1d0)) 107 | * base loader class ([#64](https://github.com/QuivrHQ/MegaParse/issues/64)) ([801a026](https://github.com/QuivrHQ/MegaParse/commit/801a026e4b3411f8ac85171a6928e3d17c027648)) 108 | -------------------------------------------------------------------------------- /libs/megaparse/README.md: -------------------------------------------------------------------------------- 1 | # MegaParse CORE 2 | 3 | - Core package of megaparse 4 | 5 | > **Note:** The test files in `tests/pdf/ocr` and `tests/pdf/native` come from SAFEDOCS (CC-MAIN-2021-31-PDF-UNTRUNCATED). You can find more information [here](https://digitalcorpora.org/corpora/file-corpora/cc-main-2021-31-pdf-untruncated/). 
-------------------------------------------------------------------------------- /libs/megaparse/bench.md: -------------------------------------------------------------------------------- 1 | ------------ 2 | UNSTRUCTURED(HI-RES): 3 | ------------ 4 | 5 | folder: cdp 6 | cdp_etiquette.pdf parsing took: 2.10s 7 | folder: scanned-tables 8 | POZIBILAN 2022.pdf parsing took: 78.72s 9 | Banco Popilar Number 2.pdf parsing took: 94.44s 10 | folder: native 11 | 00b03d60-fe45-4318-a511-18ee921b7bbb.pdf parsing took: 3.25s 12 | 0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf parsing took: 39.75s 13 | 0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf parsing took: 25.02s 14 | folder: scanned 15 | machine.pdf parsing took: 54.29s 16 | medical.pdf parsing took: 76.11s 17 | les_americains.pdf parsing took: 643.84s 18 | agency.pdf parsing took: 114.19s 19 | clark.pdf parsing took: 27.89s 20 | tables_ocr.pdf parsing took: 81.21s 21 | folder: rich 22 | language_learning.pdf parsing took: 2.60s 23 | dites nous tout....pdf parsing took: 1.62s 24 | 25 | ------------ 26 | UNSTRUCTURED(FAST): 27 | ------------ 28 | folder: cdp 29 | cdp_etiquette.pdf parsing took: 0.05s 30 | folder: scanned-tables 31 | POZIBILAN 2022.pdf: can't parse 32 | Banco Popilar Number 2.pdf: can't parse 33 | folder: native 34 | 00b03d60-fe45-4318-a511-18ee921b7bbb.pdf parsing took: 0.07s 35 | 0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf parsing took: 0.86s 36 | 0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf parsing took: 0.24s 37 | folder: scanned 38 | machine.pdf parsing took: 0.02s 39 | medical.pdf parsing took: 0.04s 40 | les_americains.pdf parsing took: 5.90s 41 | agency.pdf: can't parse 42 | clark.pdf: can't parse 43 | tables_ocr.pdf: can't parse 44 | folder: rich 45 | language_learning.pdf: can't parse 46 | dites nous tout....pdf parsing took: 0.02s 47 | 48 | ------------ 49 | Megaparse ( 50 | strategy = AUTO 51 | Config = { 52 | provider=COREML, 53 | det_arch: str = "fast_base" 54 | det_batch_size: int = 2 55 | 
assume_straight_pages: bool = True 56 | preserve_aspect_ratio: bool = True 57 | symmetric_pad: bool = True 58 | load_in_8_bit: bool = False 59 | reco_arch: str = "crnn_vgg16_bn" 60 | rec_batch_size: int = 512 61 | } 62 | ) 63 | ------------ 64 | folder: cdp 65 | cdp_etiquette.pdf parsing took: 1.71s 66 | folder: scanned-tables 67 | POZIBILAN 2022.pdf parsing took: 17.76s 68 | Banco Popilar Number 2.pdf parsing took: 19.25s 69 | folder: native 70 | 00b03d60-fe45-4318-a511-18ee921b7bbb.pdf parsing took: 0.96s 71 | 0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf parsing took: 12.57s 72 | 0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf parsing took: 1.53s 73 | folder: scanned 74 | machine.pdf parsing took: 9.90s 75 | medical.pdf parsing took: 13.09s 76 | les_americains.pdf parsing took: 139.53s 77 | agency.pdf parsing took: 10.73s 78 | clark.pdf parsing took: 10.69s 79 | tables_ocr.pdf parsing took: 15.58s 80 | folder: rich 81 | language_learning.pdf parsing took: 1.74s 82 | dites nous tout....pdf parsing took: 0.64s 83 | ---- 84 | | Type | PDF Name | Unstructured(HI-RES) | Unstructured(FAST) | Megaparse( w/ doctr COREML) | 85 | |------------------|-----------------------------------|---------------------|----------------------|--------------------| 86 | | **cdp** | cdp_etiquette.pdf | 2.10s | 0.05s (bad parsing) | 1.71s | 87 | | **scanned-tables** | POZIBILAN 2022.pdf | 78.72s | can't parse | 17.76s | 88 | | **scanned-tables** | Banco Popilar Number 2.pdf | 94.44s | can't parse | 19.25s | 89 | | **native** | 00b03d60-fe45-4318-a511-18ee921b7bbb.pdf | 3.25s | 0.07s | 0.96s | 90 | | **native** | 0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf | 39.75s | 0.86s | 12.57s | 91 | | **native** | 0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf | 25.02s | 0.24s | 1.53s | 92 | | **scanned** | machine.pdf | 54.29s | 0.02s | 9.90s | 93 | | **scanned** | medical.pdf | 76.11s | 0.04s | 13.09s | 94 | | **scanned** | les_americains.pdf | 643.84s | 5.90s | 139.53s | 95 | | **scanned** | agency.pdf | 114.19s | 
can't parse | 10.73s | 96 | | **scanned** | clark.pdf | 28.89s | can't parse | 10.69s | 97 | | **scanned** | tables_ocr.pdf | 81.21s | can't parse | 15.58s | 98 | | **rich** | language_learning.pdf | 2.60s | can't parse | 1.74s | 99 | | **rich** | dites nous tout....pdf | 1.62s | 0.02s | 0.64s | 100 | -------------------------------------------------------------------------------- /libs/megaparse/examples/parse_file_fast.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | from time import perf_counter 4 | 5 | from unstructured.partition.auto import partition 6 | 7 | 8 | @dataclass 9 | class File: 10 | file_path: str 11 | file_name: str 12 | file_extension: str 13 | 14 | 15 | def list_files_in_directory(directory_path: str) -> dict[str, list[File]]: 16 | directory_dict = {} 17 | for root, _, files in os.walk(directory_path): 18 | folder_name = os.path.basename(root) 19 | if len(folder_name) > 0: 20 | file_list = [] 21 | for file_name in files: 22 | file_path = os.path.join(root, file_name) 23 | file_extension = os.path.splitext(file_name)[1] 24 | file_list.append( 25 | File( 26 | file_path=file_path, 27 | file_name=file_name, 28 | file_extension=file_extension, 29 | ) 30 | ) 31 | directory_dict[folder_name] = file_list 32 | 33 | return directory_dict 34 | 35 | 36 | def main(): 37 | file_path = "/Users/amine/data/quivr/parsing/native/0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf" 38 | folder_path = "/Users/amine/data/quivr/parsing/" 39 | 40 | list_files = list_files_in_directory(folder_path) 41 | 42 | for folder_name, files in list_files.items(): 43 | print(f"folder: {folder_name}") 44 | for file in files: 45 | if file.file_extension == ".pdf": 46 | s = perf_counter() 47 | elements = partition( 48 | filename=file.file_path, 49 | strategy="fast", 50 | ) 51 | if len(elements) == 0: 52 | print(f"\t{file.file_name}: can't parse ") 53 | continue 54 | 55 | e = perf_counter() 56 | print(f"\t 
{file.file_name} parsing took: {e-s:.2f}s") 57 | 58 | 59 | if __name__ == "__main__": 60 | els = main() 61 | -------------------------------------------------------------------------------- /libs/megaparse/examples/parse_file_mp.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | from time import perf_counter 4 | 5 | from megaparse import MegaParse 6 | from megaparse.configs.auto import DeviceEnum, MegaParseConfig 7 | 8 | 9 | @dataclass 10 | class File: 11 | file_path: str 12 | file_name: str 13 | file_extension: str 14 | 15 | 16 | def list_files_in_directory(directory_path: str) -> dict[str, list[File]]: 17 | directory_dict = {} 18 | for root, _, files in os.walk(directory_path): 19 | folder_name = os.path.basename(root) 20 | if len(folder_name) > 0: 21 | file_list = [] 22 | for file_name in files: 23 | file_path = os.path.join(root, file_name) 24 | file_extension = os.path.splitext(file_name)[1] 25 | file_list.append( 26 | File( 27 | file_path=file_path, 28 | file_name=file_name, 29 | file_extension=file_extension, 30 | ) 31 | ) 32 | directory_dict[folder_name] = file_list 33 | 34 | return directory_dict 35 | 36 | 37 | def main(): 38 | folder_path = "/Users/amine/data/quivr/parsing/" 39 | 40 | list_files = list_files_in_directory(folder_path) 41 | config = MegaParseConfig(device=DeviceEnum.COREML) 42 | mp = MegaParse(config=config) 43 | 44 | for folder_name, files in list_files.items(): 45 | print(f"folder: {folder_name}") 46 | for file in files: 47 | if file.file_extension == ".pdf": 48 | s = perf_counter() 49 | result = mp.load(file.file_path) 50 | if len(result) == 0: 51 | print(f"\t{file.file_name}: can't parse ") 52 | continue 53 | 54 | e = perf_counter() 55 | print(f"\t {file.file_name} parsing took: {e-s:.2f}s") 56 | 57 | 58 | if __name__ == "__main__": 59 | els = main() 60 | -------------------------------------------------------------------------------- 
/libs/megaparse/examples/parse_file_unstructured.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | from time import perf_counter 4 | 5 | from unstructured.partition.auto import partition 6 | 7 | 8 | @dataclass 9 | class File: 10 | file_path: str 11 | file_name: str 12 | file_extension: str 13 | 14 | 15 | def list_files_in_directory(directory_path: str) -> dict[str, list[File]]: 16 | directory_dict = {} 17 | for root, _, files in os.walk(directory_path): 18 | folder_name = os.path.basename(root) 19 | if len(folder_name) > 0: 20 | file_list = [] 21 | for file_name in files: 22 | file_path = os.path.join(root, file_name) 23 | file_extension = os.path.splitext(file_name)[1] 24 | file_list.append( 25 | File( 26 | file_path=file_path, 27 | file_name=file_name, 28 | file_extension=file_extension, 29 | ) 30 | ) 31 | directory_dict[folder_name] = file_list 32 | 33 | return directory_dict 34 | 35 | 36 | def main(): 37 | file_path = "/Users/amine/data/quivr/parsing/native/0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf" 38 | folder_path = "/Users/amine/data/quivr/parsing/" 39 | 40 | list_files = list_files_in_directory(folder_path) 41 | 42 | for folder_name, files in list_files.items(): 43 | print(f"folder: {folder_name}") 44 | for file in files: 45 | if file.file_extension == ".pdf": 46 | s = perf_counter() 47 | _ = partition( 48 | filename=file.file_path, 49 | strategy="hi_res", 50 | ) 51 | e = perf_counter() 52 | print(f"\t {file.file_name} parsing took: {e-s:.2f}s") 53 | 54 | 55 | if __name__ == "__main__": 56 | els = main() 57 | -------------------------------------------------------------------------------- /libs/megaparse/program.prof: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/program.prof 
-------------------------------------------------------------------------------- /libs/megaparse/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "megaparse" 3 | version = "0.0.55" 4 | authors = [ 5 | { name = "Stan Girard", email = "stan@quivr.app" }, 6 | { name = "Chloé Daems", email = "chloe@quivr.app" }, 7 | { name = "Amine Dirhoussi", email = "amine@quivr.app" }, 8 | { name = "Jacopo Chevallard", email = "jacopo@quivr.app" }, 9 | ] 10 | 11 | readme = "README.md" 12 | requires-python = ">= 3.11" 13 | 14 | dependencies = [ 15 | "megaparse-sdk", 16 | "pycryptodome>=3.21.0", 17 | "pdfplumber>=0.11.0", 18 | "backoff>=2.2.1", 19 | "pypdf>=5.0.1", 20 | "psutil>=6.1.0", 21 | "numpy<=2.0.0", 22 | "playwright>=1.47.0", 23 | "langchain-anthropic>=0.1.23", 24 | "python-magic>=0.4.27", 25 | "unstructured[all-docs]==0.15.0", 26 | "langchain>=0.3,<0.4", 27 | "langchain-community>=0.3,<0.4", 28 | "langchain-openai>=0.1.21", 29 | "langchain-core>=0.3,<0.4", 30 | "llama-parse>=0.4.0", 31 | "pydantic-settings>=2.6.1", 32 | "onnxruntime==1.20.0; platform_machine == 'x86_64'", 33 | "onnxruntime-gpu==1.20.0; platform_machine == 'x86_64'", 34 | "onnxtr[gpu-headless]>=0.6.0; platform_machine == 'x86_64'", 35 | "onnxtr[cpu]>=0.6.0; platform_machine != 'x86_64'", 36 | "pypdfium2>=4.30.0", 37 | ] 38 | 39 | [project.optional-dependencies] 40 | api = [ 41 | "python-dotenv>=1.0.0", 42 | "uvloop>=0.18.0", 43 | "pydantic-settings>=2.6.1", 44 | "uvicorn>=0.32.0", 45 | "fastapi>=0.115.2", 46 | "ratelimit>=2.2.1", 47 | 48 | ] 49 | 50 | 51 | [build-system] 52 | requires = ["hatchling==1.26.3"] 53 | build-backend = "hatchling.build" 54 | 55 | [tool.rye] 56 | managed = true 57 | dev-dependencies = [] 58 | universal = true 59 | 60 | [tool.hatch.metadata] 61 | allow-direct-references = true 62 | 63 | [tool.hatch.build.targets.wheel] 64 | packages = ["src/megaparse", "src/api"] 65 | 
-------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/__init__.py: -------------------------------------------------------------------------------- 1 | from .megaparse import MegaParse 2 | 3 | __all__ = ["MegaParse"] 4 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/src/megaparse/api/__init__.py -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/api/app.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import tempfile 4 | from typing import Any, Optional 5 | 6 | import httpx 7 | import psutil 8 | import uvicorn 9 | from fastapi import Depends, FastAPI, File, Form, HTTPException, UploadFile 10 | from langchain_anthropic import ChatAnthropic 11 | from langchain_community.document_loaders import PlaywrightURLLoader 12 | from langchain_openai import ChatOpenAI 13 | from llama_parse.utils import Language 14 | from megaparse_sdk.schema.document import Document 15 | from megaparse_sdk.schema.parser_config import ( 16 | ParserType, 17 | StrategyEnum, 18 | ) 19 | from megaparse_sdk.schema.supported_models import SupportedModel 20 | 21 | from megaparse import MegaParse 22 | from megaparse.api.exceptions.megaparse_exceptions import ( 23 | HTTPDownloadError, 24 | HTTPFileNotFound, 25 | HTTPModelNotSupported, 26 | HTTPParsingException, 27 | ParsingException, 28 | ) 29 | from megaparse.parser.builder import ParserBuilder 30 | 31 | app = FastAPI() 32 | 33 | playwright_loader = PlaywrightURLLoader(urls=[], remove_selectors=["header", "footer"]) 34 | 35 | 36 | def parser_builder_dep(): 37 | return ParserBuilder() 38 | 39 | 40 | def 
get_playwright_loader(): 41 | return playwright_loader 42 | 43 | 44 | @app.get("/healthz") 45 | def healthz(): 46 | return {"status": "ok"} 47 | 48 | 49 | def _check_free_memory() -> bool: 50 | """Reject traffic when free memory is below minimum (default 2GB).""" 51 | mem = psutil.virtual_memory() 52 | memory_free_minimum = int(os.environ.get("MEMORY_FREE_MINIMUM_MB", 2048)) 53 | 54 | if mem.available <= memory_free_minimum * 1024 * 1024: 55 | return False 56 | return True 57 | 58 | 59 | @app.post( 60 | "/v1/file", 61 | ) 62 | async def parse_file( 63 | file: UploadFile = File(...), 64 | method: ParserType = Form(ParserType.UNSTRUCTURED), 65 | strategy: StrategyEnum = Form(StrategyEnum.AUTO), 66 | check_table: bool = Form(False), 67 | language: Language = Form(Language.ENGLISH), 68 | parsing_instruction: Optional[str] = Form(None), 69 | model_name: Optional[SupportedModel] = Form(SupportedModel.GPT_4O), 70 | parser_builder=Depends(parser_builder_dep), 71 | ) -> dict[str, str | Document]: 72 | if not _check_free_memory(): 73 | raise HTTPException( 74 | status_code=503, detail="Service unavailable due to low memory" 75 | ) 76 | model = None 77 | if model_name and check_table: 78 | if model_name.startswith("gpt"): 79 | model = ChatOpenAI(model=model_name, api_key=os.getenv("OPENAI_API_KEY")) # type: ignore 80 | elif model_name.startswith("claude"): 81 | model = ChatAnthropic( 82 | model_name=model_name, 83 | api_key=os.getenv("ANTHROPIC_API_KEY"), # type: ignore 84 | timeout=60, 85 | stop=None, 86 | ) 87 | 88 | else: 89 | raise HTTPModelNotSupported() 90 | 91 | # parser_config = ParseFileConfig( #FIXME 92 | # method=method, 93 | # strategy=strategy, 94 | # llm_model_name=SupportedModel(model_name) if model_name and check_table else None, 95 | # language=language, 96 | # parsing_instruction=parsing_instruction, 97 | # ) 98 | try: 99 | # parser = parser_builder.build(parser_config) 100 | megaparse = MegaParse() 101 | if not file.filename: 102 | raise 
HTTPFileNotFound("No filename provided") 103 | _, extension = os.path.splitext(file.filename) 104 | file_bytes = await file.read() 105 | file_stream = io.BytesIO(file_bytes) 106 | result = await megaparse.aload(file=file_stream, file_extension=extension) 107 | return {"message": "File parsed successfully", "result": result} 108 | except ParsingException as e: 109 | print(e) 110 | raise HTTPParsingException(file.filename) 111 | except ValueError as e: 112 | print(e) 113 | raise HTTPException(status_code=400, detail=str(e)) 114 | except Exception as e: 115 | print(e) 116 | raise HTTPException(status_code=500, detail=str(e)) 117 | 118 | 119 | @app.post( 120 | "/v1/url", 121 | ) 122 | async def upload_url( 123 | url: str, playwright_loader=Depends(get_playwright_loader) 124 | ) -> dict[str, Any]: 125 | playwright_loader.urls = [url] 126 | 127 | if url.endswith(".pdf"): 128 | ## Download the file 129 | 130 | async with httpx.AsyncClient() as client: 131 | response = await client.get(url) 132 | if response.status_code != 200: 133 | raise HTTPDownloadError(url) 134 | 135 | with tempfile.NamedTemporaryFile(delete=False, suffix="pdf") as temp_file: 136 | temp_file.write(response.content) 137 | try: 138 | megaparse = MegaParse() 139 | result = await megaparse.aload(temp_file.name) 140 | return {"message": "File parsed successfully", "result": result} 141 | except ParsingException: 142 | raise HTTPParsingException(url) 143 | else: 144 | data = await playwright_loader.aload() 145 | # Now turn the data into a string 146 | extracted_content = "" 147 | for page in data: 148 | extracted_content += page.page_content 149 | if not extracted_content: 150 | raise HTTPDownloadError( 151 | url, 152 | message="Failed to extract content from the website. 
Valid URL example : https://www.quivr.com", 153 | ) 154 | return { 155 | "message": "Website content parsed successfully", 156 | "result": extracted_content, 157 | } 158 | 159 | 160 | if __name__ == "__main__": 161 | uvicorn.run(app, host="0.0.0.0", port=8000) 162 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/api/exceptions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/src/megaparse/api/exceptions/__init__.py -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/api/exceptions/megaparse_exceptions.py: -------------------------------------------------------------------------------- 1 | from fastapi import HTTPException 2 | 3 | 4 | class HTTPModelNotSupported(HTTPException): 5 | def __init__( 6 | self, 7 | detail: str = "The requested model is not supported yet.", 8 | headers: dict | None = None, 9 | ): 10 | super().__init__(status_code=501, detail=detail, headers=headers) 11 | 12 | 13 | class HTTPFileNotFound(HTTPException): 14 | def __init__( 15 | self, 16 | message="The UploadFile.filename does not exist and is needed for this operation", 17 | ): 18 | super().__init__(status_code=404, detail=message) 19 | 20 | 21 | class HTTPDownloadError(HTTPException): 22 | def __init__(self, file_name, message="Failed to download the file"): 23 | message = f"{file_name} : {message}" 24 | super().__init__(status_code=400, detail=message) 25 | 26 | 27 | class HTTPParsingException(HTTPException): 28 | def __init__(self, file_name, message="Failed to parse the file"): 29 | message = f"{file_name} : {message}" 30 | super().__init__(status_code=500, detail=message) 31 | 32 | 33 | class ParsingException(Exception): 34 | """Exception raised for errors in the parsing process.""" 35 | 36 | def __init__(self, 
message="An error occurred during parsing"): 37 | self.message = message 38 | super().__init__(self.message) 39 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/api/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/src/megaparse/api/models/__init__.py -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/api/models/base.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class MarkDownType(str, Enum): 5 | """Markdown type enumeration.""" 6 | 7 | TITLE = "Title" 8 | SUBTITLE = "Subtitle" 9 | HEADER = "Header" 10 | FOOTER = "Footer" 11 | NARRATIVE_TEXT = "NarrativeText" 12 | LIST_ITEM = "ListItem" 13 | TABLE = "Table" 14 | PAGE_BREAK = "PageBreak" 15 | IMAGE = "Image" 16 | FORMULA = "Formula" 17 | FIGURE_CAPTION = "FigureCaption" 18 | ADDRESS = "Address" 19 | EMAIL_ADDRESS = "EmailAddress" 20 | CODE_SNIPPET = "CodeSnippet" 21 | PAGE_NUMBER = "PageNumber" 22 | DEFAULT = "Default" 23 | UNDEFINED = "Undefined" 24 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/configs/auto.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | from pydantic import BaseModel 4 | from pydantic_settings import BaseSettings, SettingsConfigDict 5 | 6 | 7 | class TextDetConfig(BaseModel): 8 | det_arch: str = "fast_base" 9 | batch_size: int = 2 10 | assume_straight_pages: bool = True 11 | preserve_aspect_ratio: bool = True 12 | symmetric_pad: bool = True 13 | load_in_8_bit: bool = False 14 | 15 | 16 | class AutoStrategyConfig(BaseModel): 17 | page_threshold: float = 0.6 18 | document_threshold: float = 0.2 19 | 20 | 21 | 
class TextRecoConfig(BaseModel): 22 | reco_arch: str = "crnn_vgg16_bn" 23 | batch_size: int = 512 24 | 25 | 26 | class DeviceEnum(str, Enum): 27 | CPU = "cpu" 28 | CUDA = "cuda" 29 | COREML = "coreml" 30 | 31 | 32 | class DoctrConfig(BaseModel): 33 | straighten_pages: bool = False 34 | detect_orientation: bool = False 35 | detect_language: bool = False 36 | text_det_config: TextDetConfig = TextDetConfig() 37 | text_reco_config: TextRecoConfig = TextRecoConfig() 38 | 39 | 40 | class MegaParseConfig(BaseSettings): 41 | """ 42 | Configuration for Megaparse. 43 | """ 44 | 45 | model_config = SettingsConfigDict( 46 | env_prefix="MEGAPARSE_", 47 | env_file=(".env.local", ".env"), 48 | env_nested_delimiter="__", 49 | extra="ignore", 50 | use_enum_values=True, 51 | ) 52 | doctr_config: DoctrConfig = DoctrConfig() 53 | auto_config: AutoStrategyConfig = AutoStrategyConfig() 54 | device: DeviceEnum = DeviceEnum.CPU 55 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/examples/parse_file.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from megaparse.megaparse import MegaParse 4 | from pydantic import BaseModel, Field 5 | 6 | 7 | class MyCustomFormat(BaseModel): 8 | title: str = Field(description="The title of the document.") 9 | problem: str = Field(description="The problem statement.") 10 | solution: str = Field(description="The solution statement.") 11 | 12 | 13 | def main(): 14 | # model = ChatOpenAI(name="gpt-4o") 15 | # formatter_1 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat) 16 | 17 | megaparse = MegaParse() 18 | 19 | file_path = Path("./tests/pdf/ocr/0168127.pdf") 20 | result = megaparse.load(file_path=file_path) 21 | print(result) 22 | 23 | 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- 
/libs/megaparse/src/megaparse/exceptions/base.py: -------------------------------------------------------------------------------- 1 | class ParsingException(Exception): 2 | """Exception raised for errors in the parsing process.""" 3 | 4 | def __init__(self, message="An error occurred during parsing"): 5 | self.message = message 6 | super().__init__(self.message) 7 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/formatter/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from pathlib import Path 3 | from typing import Union 4 | 5 | from langchain_core.language_models.chat_models import BaseChatModel 6 | from megaparse_sdk.schema.document import Document 7 | 8 | 9 | class BaseFormatter(ABC): 10 | """ 11 | A class used to improve the layout of elements, particularly focusing on converting HTML tables to markdown tables. 12 | Attributes 13 | ---------- 14 | model : BaseChatModel 15 | An instance of a chat model used to process and improve the layout of elements. 16 | Methods 17 | ------- 18 | improve_layout(elements: List[Element]) -> List[Element] 19 | Processes a list of elements, converting HTML tables to markdown tables and improving the overall layout. 
20 | """ 21 | 22 | def __init__(self, model: BaseChatModel | None = None): 23 | self.model = model 24 | 25 | def format( 26 | self, document: Document, file_path: Path | str | None = None 27 | ) -> Union[Document, str]: 28 | raise NotImplementedError("Subclasses should implement this method") 29 | 30 | async def aformat( 31 | self, document: Document, file_path: Path | str | None = None 32 | ) -> Union[Document, str]: 33 | raise NotImplementedError("Subclasses should implement this method") 34 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from langchain_core.language_models.chat_models import BaseChatModel 4 | from megaparse.formatter.base import BaseFormatter 5 | from megaparse_sdk.schema.document import Document 6 | from pydantic import BaseModel 7 | 8 | 9 | class StructuredFormatter(BaseFormatter): 10 | def __init__(self, model: BaseChatModel, output_model: type[BaseModel]): 11 | super().__init__(model) 12 | self.output_model = output_model 13 | 14 | async def aformat( 15 | self, 16 | document: Document, 17 | file_path: Path | str | None = None, 18 | ) -> str: # FIXME: Return a structured output of type BaseModel ? 19 | raise NotImplementedError() 20 | 21 | def format( 22 | self, 23 | document: Document, 24 | file_path: Path | str | None = None, 25 | ) -> str: # FIXME: Return a structured output of type BaseModel ? 
26 | raise NotImplementedError() 27 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from megaparse.formatter.structured_formatter import StructuredFormatter 4 | from megaparse_sdk.schema.document import Document 5 | from pydantic import BaseModel 6 | 7 | 8 | class CustomStructuredFormatter(StructuredFormatter): 9 | def format( 10 | self, 11 | document: Document, 12 | file_path: Path | str | None = None, 13 | ) -> str: 14 | """ 15 | Structure the file using an AI language model. 16 | Args: 17 | text: The text to format. 18 | file_path: The file path of the text. 19 | model: The AI language model to use for formatting. 20 | Returns: 21 | The structured text. 22 | """ 23 | if not self.model: 24 | raise ValueError("A Model is needed to use the CustomStructuredFormatter.") 25 | print("Formatting text using CustomStructuredFormatter...") 26 | text = str(document) 27 | if len(text) < 0: 28 | raise ValueError( 29 | "A non empty text is needed to format text using CustomStructuredFormatter." 30 | ) 31 | if not self.output_model: 32 | raise ValueError( 33 | "An output model is needed to structure text using CustomStructuredFormatter." 34 | ) 35 | 36 | structured_model = self.model.with_structured_output(self.output_model) # type: ignore 37 | 38 | formatted_text = structured_model.invoke( 39 | f"Parse the text in a structured format: {text}" 40 | ) 41 | assert isinstance(formatted_text, BaseModel), "Model output is not a BaseModel." 42 | 43 | return formatted_text.model_dump_json() 44 | 45 | async def aformat( 46 | self, 47 | document: Document, 48 | file_path: Path | str | None = None, 49 | ) -> str: 50 | """ 51 | Asynchronously structure the file using an AI language model. 52 | Args: 53 | text: The text to format. 
54 | file_path: The file path of the text. 55 | model: The AI language model to use for formatting. 56 | Returns: 57 | The structured text. 58 | """ 59 | if not self.model: 60 | raise ValueError("A Model is needed to use the CustomStructuredFormatter.") 61 | print("Formatting text using CustomStructuredFormatter...") 62 | text = str(document) 63 | 64 | if len(text) < 0: 65 | raise ValueError( 66 | "A non empty text is needed to format text using CustomStructuredFormatter." 67 | ) 68 | if not self.output_model: 69 | raise ValueError( 70 | "An output model is needed to structure text using CustomStructuredFormatter." 71 | ) 72 | 73 | structured_model = self.model.with_structured_output(self.output_model) # type: ignore 74 | 75 | formatted_text = await structured_model.ainvoke( 76 | f"Parse the text in a structured format: {text}" 77 | ) 78 | assert isinstance(formatted_text, BaseModel), "Model output is not a BaseModel." 79 | 80 | return formatted_text.model_dump_json() 81 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from megaparse.formatter.base import BaseFormatter 4 | from megaparse_sdk.schema.document import Document 5 | 6 | 7 | class TableFormatter(BaseFormatter): 8 | def format( 9 | self, document: Document, file_path: Path | str | None = None 10 | ) -> Document: 11 | raise NotImplementedError("Subclasses should implement this method") 12 | 13 | async def aformat( 14 | self, document: Document, file_path: Path | str | None = None 15 | ) -> Document: 16 | raise NotImplementedError("Subclasses should implement this method") 17 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | import warnings 3 | from pathlib import Path 4 | from typing import Optional 5 | 6 | from langchain_core.language_models.chat_models import BaseChatModel 7 | from langchain_core.prompts import ChatPromptTemplate 8 | from megaparse.formatter.table_formatter import TableFormatter 9 | from megaparse_sdk.schema.document import Document, TableBlock 10 | 11 | 12 | class SimpleMDTableFormatter(TableFormatter): 13 | """ 14 | A formatter that converts table elements into Markdown format using llms. 15 | """ 16 | 17 | TABLE_MARKER_START = "[TABLE]" 18 | TABLE_MARKER_END = "[/TABLE]" 19 | CODE_BLOCK_PATTERN = r"^```.*$\n?" 20 | 21 | def __init__(self, model: Optional[BaseChatModel] = None): 22 | super().__init__(model) 23 | 24 | async def aformat( 25 | self, document: Document, file_path: Path | str | None = None 26 | ) -> Document: 27 | warnings.warn( 28 | "The SimpleMDTableFormatter is a sync formatter, please use the sync format method", 29 | UserWarning, 30 | stacklevel=2, 31 | ) 32 | return self.format(document=document, file_path=file_path) 33 | 34 | def format( 35 | self, document: Document, file_path: Path | str | None = None 36 | ) -> Document: 37 | """ 38 | Formats table elements within a list of elements. 39 | Args: 40 | elements: A list of Element objects. 41 | Returns: 42 | A list of Element objects with formatted tables. 
43 | """ 44 | if not self.model: 45 | raise ValueError("A Model is needed to use the SimpleMDTableFormatter.") 46 | print("Formatting tables using SimpleMDTableFormatter...") 47 | table_stack = [] 48 | formatted_elements = [] 49 | 50 | for block in document.content: 51 | if isinstance(block, TableBlock): 52 | previous_table = table_stack[-1] if table_stack else "" 53 | formatted_table = self.format_table(block, previous_table) 54 | table_stack.append(formatted_table.text) 55 | formatted_elements.append(formatted_table) 56 | else: 57 | formatted_elements.append(block) 58 | 59 | document.content = formatted_elements 60 | return document 61 | 62 | def format_table( 63 | self, table_element: TableBlock, previous_table: str 64 | ) -> TableBlock: 65 | """ 66 | Formats a single table element into Markdown using an AI language model. 67 | Args: 68 | table_element: The table element to format. 69 | previous_table: The previously formatted table text. 70 | Returns: 71 | The formatted table element. 72 | """ 73 | assert self.model is not None, "Model is not set." 74 | 75 | prompt = ChatPromptTemplate.from_messages( 76 | [ 77 | ( 78 | "human", 79 | ( 80 | "You are an expert in markdown tables. Transform the following parsed table into a " 81 | "markdown table. 
Provide just the table in pure markdown, nothing else.\n" 82 | "\n{text}\n\n" 83 | "\n{previous_table}\n" 84 | ), 85 | ), 86 | ] 87 | ) 88 | 89 | chain = prompt | self.model 90 | result = chain.invoke( 91 | { 92 | "text": table_element.text, 93 | "previous_table": previous_table, 94 | } 95 | ) 96 | 97 | content_str = str(result.content) 98 | cleaned_content = re.sub( 99 | self.CODE_BLOCK_PATTERN, "", content_str, flags=re.MULTILINE 100 | ) 101 | markdown_table = ( 102 | f"{self.TABLE_MARKER_START}\n" 103 | f"{cleaned_content}\n" 104 | f"{self.TABLE_MARKER_END}\n\n" 105 | ) 106 | 107 | table_element.text = markdown_table 108 | 109 | return table_element 110 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from io import BytesIO 3 | from pathlib import Path 4 | from typing import List, Optional 5 | 6 | from langchain_core.language_models.chat_models import BaseChatModel 7 | from langchain_core.messages import HumanMessage 8 | from megaparse.formatter.table_formatter import TableFormatter 9 | from megaparse_sdk.schema.document import Document, TableBlock 10 | from pdf2image import convert_from_path 11 | from PIL import Image 12 | 13 | TABLE_OCR_PROMPT = """ 14 | You are tasked with transcribing the content of a table into markdown format. Your goal is to create a well-structured, readable markdown table that accurately represents the original content while adding appropriate formatting. 15 | Answer uniquely with the parsed table. Do not include the fenced code blocks backticks. 16 | """ 17 | 18 | 19 | class VisionMDTableFormatter(TableFormatter): 20 | """ 21 | A formatter that converts table elements into Markdown format using an AI language model. 
22 | """ 23 | 24 | TABLE_MARKER_START = "[TABLE]" 25 | TABLE_MARKER_END = "[/TABLE]" 26 | CODE_BLOCK_PATTERN = r"^```.*$\n?" 27 | 28 | def __init__(self, model: Optional[BaseChatModel] = None): 29 | super().__init__(model) 30 | 31 | def _crop_table_image(self, table_element: TableBlock, file_path: str) -> str: 32 | """ 33 | Helper method to crop the table portion of the PDF page and convert it to a base64 string. 34 | """ 35 | assert table_element.bbox, "Table element must have coordinates." 36 | bbox = table_element.bbox 37 | page_number = table_element.page_range[0] 38 | assert page_number, "Table element must have a page number." 39 | assert bbox, "Table element must have coordinates." 40 | 41 | pages = convert_from_path(file_path) 42 | 43 | # Calculate the box for cropping 44 | box = ( 45 | bbox.top_left.x, 46 | bbox.top_left.y, 47 | bbox.bottom_right.x, 48 | bbox.bottom_right.y, 49 | ) 50 | table_image = pages[page_number - 1].crop(box) 51 | # Convert the cropped image to base64 52 | table_image64 = self.process_file([table_image])[0] 53 | return table_image64 54 | 55 | async def aformat( 56 | self, document: Document, file_path: Path | str | None = None 57 | ) -> Document: 58 | """ 59 | Asynchronously formats table elements within a list of elements. 60 | """ 61 | if not self.model: 62 | raise ValueError("A Model is needed to use the VisionMDTableFormatter.") 63 | print("Formatting tables using VisionMDTableFormatter (async)...") 64 | assert ( 65 | file_path 66 | ), "A file path is needed to format tables using VisionMDTableFormatter." 
67 | if not isinstance(file_path, str): 68 | file_path = str(file_path) 69 | formatted_elements = [] 70 | for block in document.content: 71 | if isinstance(block, TableBlock): 72 | formatted_table = await self.aformat_table(block, file_path) 73 | formatted_elements.append(formatted_table) 74 | else: 75 | formatted_elements.append(block) 76 | 77 | document.content = formatted_elements 78 | return document 79 | 80 | def format( 81 | self, document: Document, file_path: Path | str | None = None 82 | ) -> Document: 83 | """ 84 | Asynchronously formats table elements within a list of elements. 85 | """ 86 | if not self.model: 87 | raise ValueError("A Model is needed to use the VisionMDTableFormatter.") 88 | print("Formatting tables using VisionMDTableFormatter (async)...") 89 | assert ( 90 | file_path 91 | ), "A file path is needed to format tables using VisionMDTableFormatter." 92 | if not isinstance(file_path, str): 93 | file_path = str(file_path) 94 | formatted_elements = [] 95 | for block in document.content: 96 | if isinstance(block, TableBlock): 97 | formatted_table = self.format_table(block, file_path) 98 | formatted_elements.append(formatted_table) 99 | else: 100 | formatted_elements.append(block) 101 | 102 | document.content = formatted_elements 103 | return document 104 | 105 | async def aformat_table( 106 | self, table_element: TableBlock, file_path: str 107 | ) -> TableBlock: 108 | """ 109 | Asynchronously formats a table element into Markdown format using a Vision Model. 
110 | """ 111 | table_image64 = self._crop_table_image(table_element, file_path) 112 | formatted_table = await self.avision_extract(table_image64) 113 | 114 | markdown_table = ( 115 | f"{self.TABLE_MARKER_START}\n" 116 | f"{formatted_table}\n" 117 | f"{self.TABLE_MARKER_END}\n\n" 118 | ) 119 | # Replace the element's text with the formatted table text 120 | table_element.text = markdown_table 121 | return table_element 122 | 123 | def format_table(self, table_element: TableBlock, file_path: str) -> TableBlock: 124 | """ 125 | Asynchronously formats a table element into Markdown format using a Vision Model. 126 | """ 127 | table_image64 = self._crop_table_image(table_element, file_path) 128 | formatted_table = self.vision_extract(table_image64) 129 | 130 | markdown_table = ( 131 | f"{self.TABLE_MARKER_START}\n" 132 | f"{formatted_table}\n" 133 | f"{self.TABLE_MARKER_END}\n\n" 134 | ) 135 | # Replace the element's text with the formatted table text 136 | table_element.text = markdown_table 137 | return table_element 138 | 139 | def process_file(self, images: List[Image.Image], image_format="PNG") -> List[str]: 140 | """ 141 | Convert a list of PIL images to base64 encoded images. 142 | """ 143 | try: 144 | images_base64 = [] 145 | for image in images: 146 | buffered = BytesIO() 147 | image.save(buffered, format=image_format) 148 | image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") 149 | images_base64.append(image_base64) 150 | return images_base64 151 | except Exception as e: 152 | raise ValueError(f"Error processing PDF file: {str(e)}") 153 | 154 | async def avision_extract(self, table_image: str) -> str: 155 | """ 156 | Asynchronously send image data to the language model for processing. 157 | """ 158 | assert ( 159 | self.model 160 | ), "A model is needed to use the VisionMDTableFormatter (async)." 
161 | image_prompt = { 162 | "type": "image_url", 163 | "image_url": {"url": f"data:image/jpeg;base64,{table_image}"}, 164 | } 165 | 166 | message = HumanMessage( 167 | content=[ 168 | {"type": "text", "text": TABLE_OCR_PROMPT}, 169 | image_prompt, 170 | ], 171 | ) 172 | response = await self.model.ainvoke([message]) 173 | return str(response.content) 174 | 175 | def vision_extract(self, table_image: str) -> str: 176 | """ 177 | Synchronously send image data to the language model for processing. 178 | """ 179 | assert self.model, "A model is needed to use the VisionMDTableFormatter (sync)." 180 | image_prompt = { 181 | "type": "image_url", 182 | "image_url": {"url": f"data:image/jpeg;base64,{table_image}"}, 183 | } 184 | 185 | message = HumanMessage( 186 | content=[ 187 | {"type": "text", "text": TABLE_OCR_PROMPT}, 188 | image_prompt, 189 | ], 190 | ) 191 | response = self.model.invoke([message]) 192 | return str(response.content) 193 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/layout_detection/layout_detector.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pathlib 4 | import uuid 5 | from typing import Any, List 6 | 7 | import numpy as np 8 | import onnxruntime as rt 9 | from megaparse.configs.auto import DeviceEnum 10 | from megaparse.layout_detection.output import LayoutDetectionOutput 11 | from megaparse.utils.onnx import get_providers 12 | from megaparse_sdk.schema.document import BBOX, Point2D 13 | from onnxtr.models.engine import EngineConfig 14 | from onnxtr.models.preprocessor import PreProcessor 15 | from PIL import Image, ImageDraw 16 | from PIL.Image import Image as PILImage 17 | 18 | logger = logging.getLogger("megaparse") 19 | 20 | LABEL_MAP = { 21 | 0: "Caption", 22 | 1: "Footnote", 23 | 2: "Formula", 24 | 3: "List-item", 25 | 4: "Page-footer", 26 | 5: "Page-header", 27 | 6: "Picture", 28 | 7: 
"Section-header", 29 | 8: "Table", 30 | 9: "Text", 31 | 10: "Title", 32 | } 33 | 34 | default_cfg: dict[str, dict[str, Any]] = { 35 | "yolov10s-doclaynet": { 36 | "mean": (0.5, 0.5, 0.5), 37 | "std": (1.0, 1.0, 1.0), 38 | "url_8_bit": None, 39 | "input_shape": (1, 1024, 1024), 40 | "url": pathlib.Path(__file__).parent.joinpath("models/yolov10s-doclaynet.onnx"), 41 | } 42 | } 43 | 44 | 45 | class LayoutDetector: 46 | def __init__( 47 | self, 48 | device: DeviceEnum = DeviceEnum.CPU, 49 | threshold: float = 0.1, 50 | preserve_aspect_ratio: bool = True, 51 | model_name: str = "yolov10s-doclaynet", 52 | load_in_8_bit: bool = False, 53 | ): 54 | model_config = default_cfg[model_name] 55 | self.device = device 56 | general_options = rt.SessionOptions() 57 | providers = get_providers(self.device) 58 | self.threshold = threshold 59 | self.batch_size, self.required_width, self.required_height = model_config[ 60 | "input_shape" 61 | ] 62 | self.preserve_aspect_ratio = preserve_aspect_ratio 63 | 64 | self.pre_processor = PreProcessor( 65 | output_size=(self.required_width, self.required_height), 66 | batch_size=self.batch_size, 67 | preserve_aspect_ratio=self.preserve_aspect_ratio, 68 | ) 69 | 70 | engine_config = EngineConfig( 71 | session_options=general_options, 72 | providers=providers, 73 | ) 74 | model_path = ( 75 | model_config.get("url_8_bit") if load_in_8_bit else model_config.get("url") 76 | ) 77 | assert model_path, f"Model path not found for {model_name}" 78 | 79 | self.model = rt.InferenceSession(model_path, engine_config=engine_config) 80 | 81 | def __call__( 82 | self, img_pages: list[PILImage], output_dir: str | None = None 83 | ) -> List[List[LayoutDetectionOutput]]: 84 | pages = [np.array(img) for img in img_pages] 85 | # Dimension check 86 | if any(page.ndim != 3 for page in pages): 87 | raise ValueError( 88 | "incorrect input shape: all pages are expected to be multi-channel 2D images." 
89 | ) 90 | processed_batches = self.pre_processor(pages) 91 | processed_batches = np.array(processed_batches) 92 | processed_batches = processed_batches.squeeze(1) # Horrendus 93 | processed_batches = processed_batches.transpose(0, 3, 1, 2) 94 | 95 | pred_batches = np.array( 96 | [ 97 | self.model.run(None, {"images": np.expand_dims(batch, axis=0)}) 98 | for batch in processed_batches 99 | ] 100 | ) 101 | pred_batches = np.concatenate(pred_batches, axis=0) 102 | pred_batches = pred_batches.squeeze(1) # Horrendus 103 | 104 | processed_preds = [] 105 | for page, pred in zip(pages, pred_batches, strict=True): 106 | img_h, img_w = page.shape[:2] 107 | bboxes = self.extract_bboxes_from_page(pred, img_h, img_w) 108 | processed_preds.append(bboxes) 109 | 110 | if output_dir: 111 | self._save_layout(pages=pages, preds=processed_preds, output_dir=output_dir) 112 | 113 | return processed_preds 114 | 115 | def extract_bboxes_from_page( 116 | self, preds: np.ndarray, img_h: int, img_w: int 117 | ) -> List[LayoutDetectionOutput]: 118 | results = [] 119 | 120 | assert preds.shape == (300, 6) 121 | 122 | scale_h = img_h / self.required_height 123 | scale_w = img_w / self.required_width 124 | 125 | for det in preds: 126 | # Rescale the bounding box coordinates to the original dimensions 127 | x1, y1, x2, y2, score, cls_idx = det 128 | if score < self.threshold: 129 | continue 130 | 131 | x1 *= scale_w 132 | x2 *= scale_w 133 | y1 *= scale_h 134 | y2 *= scale_h 135 | 136 | if self.preserve_aspect_ratio: 137 | ratio = img_h / img_w 138 | x1 = x1 * (ratio if ratio > 1 else 1) 139 | x2 = x2 * (ratio if ratio > 1 else 1) 140 | y1 = y1 / (ratio if ratio < 1 else 1) 141 | y2 = y2 / (ratio if ratio < 1 else 1) 142 | 143 | x1 = max(0, min(x1, img_w)) 144 | x2 = max(0, min(x2, img_w)) 145 | y1 = max(0, min(y1, img_h)) 146 | y2 = max(0, min(y2, img_h)) 147 | 148 | bbox_id = uuid.uuid4() 149 | 150 | results.append( 151 | LayoutDetectionOutput( 152 | bbox_id=bbox_id, 153 | bbox=BBOX( 154 | 
top_left=Point2D(x=x1 / img_w, y=y1 / img_h), 155 | bottom_right=Point2D(x=x2 / img_w, y=y2 / img_h), 156 | ), 157 | prob=det[4], 158 | label=int(det[5]), 159 | ) 160 | ) 161 | 162 | result = self.topK(results) # or topK 163 | return result 164 | 165 | def nms( 166 | self, 167 | raw_bboxes: List[LayoutDetectionOutput], 168 | iou_threshold: float = 0.9, # FIXME: thresh Configurable in constructor 169 | ) -> List[LayoutDetectionOutput]: 170 | """ 171 | Non-Maximum Suppression (NMS) algorithm. 172 | 173 | Args: 174 | raw_bboxes (list): List of LayoutBBox objects. 175 | iou_threshold (float): IoU threshold for suppression. 176 | 177 | Returns: 178 | None: The input list `raw_bboxes` is modified in-place. 179 | """ 180 | raw_bboxes.sort(key=lambda x: x.prob, reverse=True) 181 | 182 | current_index = 0 183 | for index in range(len(raw_bboxes)): 184 | drop = False 185 | for prev_index in range(current_index): 186 | iou = raw_bboxes[index].bbox.iou(raw_bboxes[prev_index].bbox) 187 | if iou > iou_threshold: 188 | drop = True 189 | break 190 | if not drop: 191 | raw_bboxes[current_index], raw_bboxes[index] = ( 192 | raw_bboxes[index], 193 | raw_bboxes[current_index], 194 | ) 195 | current_index += 1 196 | 197 | return raw_bboxes[:current_index] 198 | 199 | def topK( 200 | self, detectResult: List[LayoutDetectionOutput], topK: int = 50 201 | ) -> List[LayoutDetectionOutput]: 202 | if len(detectResult) <= topK: 203 | return detectResult 204 | else: 205 | predBoxs = [] 206 | sort_detectboxs = sorted(detectResult, key=lambda x: x.prob, reverse=True) 207 | for i in range(topK): 208 | predBoxs.append(sort_detectboxs[i]) 209 | return predBoxs 210 | 211 | def _save_layout( 212 | self, 213 | pages: list[np.ndarray], 214 | preds: list[list[LayoutDetectionOutput]], 215 | output_dir: str, 216 | ): 217 | os.makedirs(output_dir, exist_ok=True) 218 | for i, (page, layout) in enumerate(zip(pages, preds, strict=True)): 219 | image = Image.fromarray(page) 220 | draw = ImageDraw.Draw(image) 
class LayoutDetectionOutput(BaseModel):
    """A single layout region detected on one page.

    Produced by ``LayoutDetector.extract_bboxes_from_page``, which divides the
    pixel coordinates by the page width/height, so ``bbox`` is normalized to
    the 0-1 range.
    """

    # Unique identifier assigned to this detection (uuid4 at creation time).
    bbox_id: UUID
    # Normalized bounding box of the detected region.
    bbox: BBOX
    # Model confidence score for this detection.
    prob: float
    # Integer class index; see LABEL_MAP in layout_detector.py for the names.
    label: int
PdfPage 8 | 9 | 10 | class PageDimension(BaseModel): 11 | """ 12 | A class to represent a page dimension 13 | """ 14 | 15 | width: float 16 | height: float 17 | 18 | 19 | class Page(BaseModel): 20 | """ 21 | A class to represent a page 22 | """ 23 | 24 | strategy: StrategyEnum 25 | text_detections: TextDetection | None = None 26 | rasterized: PILImage | None = None 27 | page_size: PageDimension 28 | page_index: int 29 | pdfium_elements: PdfPage 30 | 31 | model_config = ConfigDict(arbitrary_types_allowed=True) 32 | 33 | 34 | class GatewayDocument(BaseModel): 35 | """ 36 | A class to represent a Gateway MegaParse Document, which is a container of pages. 37 | """ 38 | 39 | file_name: str 40 | pages: List[Page] 41 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseParser 2 | 3 | __all__ = ["BaseParser"] 4 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/parser/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from pathlib import Path 3 | from typing import IO 4 | 5 | from megaparse_sdk.schema.document import Document 6 | from megaparse_sdk.schema.extensions import FileExtension 7 | 8 | 9 | class BaseParser(ABC): 10 | """Mother Class for all the parsers [Unstructured, LlamaParse, MegaParseVision]""" 11 | 12 | supported_extensions = [] 13 | 14 | def check_supported_extension( 15 | self, file_extension: FileExtension | None, file_path: str | Path | None = None 16 | ): 17 | if not file_extension and not file_path: 18 | raise ValueError( 19 | f"Either file_path or file_extension must be provided for {self.__class__.__name__}" 20 | ) 21 | if file_path and not file_extension: 22 | file_path = Path(file_path) if isinstance(file_path, str) else 
file_path 23 | file_extension = FileExtension(file_path.suffix) 24 | if file_extension and file_extension not in self.supported_extensions: 25 | raise ValueError( 26 | f"Unsupported file extension {file_extension.value} for {self.__class__.__name__}" 27 | ) 28 | 29 | @abstractmethod 30 | async def aconvert( 31 | self, 32 | file_path: str | Path | None = None, 33 | file: IO[bytes] | None = None, 34 | file_extension: FileExtension | None = None, 35 | **kwargs, 36 | ) -> Document: 37 | """ 38 | Convert the given file to a specific format. 39 | 40 | Args: 41 | file_path (str | Path): The path to the file to be converted. 42 | **kwargs: Additional keyword arguments for the conversion process. 43 | 44 | Returns: 45 | str: The result of the conversion process. 46 | 47 | Raises: 48 | NotImplementedError: If the method is not implemented by a subclass. 49 | """ 50 | raise NotImplementedError("Subclasses should implement this method") 51 | 52 | @abstractmethod 53 | def convert( 54 | self, 55 | file_path: str | Path | None = None, 56 | file: IO[bytes] | None = None, 57 | file_extension: FileExtension | None = None, 58 | **kwargs, 59 | ) -> Document: 60 | """ 61 | Convert the given file to the unstructured format. 62 | 63 | Args: 64 | file_path (str | Path): The path to the file to be converted. 65 | **kwargs: Additional keyword arguments for the conversion process. 66 | 67 | Returns: 68 | str: The result of the conversion process. 69 | 70 | Raises: 71 | NotImplementedError: If the method is not implemented by a subclass. 
72 | """ 73 | raise NotImplementedError("Subclasses should implement this method") 74 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/parser/builder.py: -------------------------------------------------------------------------------- 1 | from megaparse_sdk.schema.parser_config import ParseFileConfig 2 | 3 | from megaparse.parser.base import BaseParser 4 | from megaparse.parser.llama import LlamaParser 5 | from megaparse.parser.megaparse_vision import MegaParseVision 6 | from megaparse.parser.unstructured_parser import UnstructuredParser 7 | 8 | parser_dict: dict[str, type] = { 9 | "unstructured": UnstructuredParser, 10 | "llama_parser": LlamaParser, 11 | "megaparse_vision": MegaParseVision, 12 | } 13 | 14 | 15 | class ParserBuilder: 16 | def build(self, config: ParseFileConfig) -> BaseParser: 17 | """ 18 | Build a parser based on the given configuration. 19 | 20 | Args: 21 | config (ParserDict): The configuration to be used for building the parser. 22 | 23 | Returns: 24 | BaseParser: The built parser. 25 | 26 | Raises: 27 | ValueError: If the configuration is invalid. 
28 | """ 29 | return parser_dict[config.method](**config.model_dump()) 30 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/parser/entity.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import List, Optional 3 | 4 | 5 | class TagEnum(str, Enum): 6 | """Possible tags for the elements in the file""" 7 | 8 | TABLE = "TABLE" 9 | TOC = "TOC" 10 | HEADER = "HEADER" 11 | IMAGE = "IMAGE" 12 | 13 | 14 | class SupportedModel(Enum): 15 | GPT_4O = ("gpt-4o", None) 16 | GPT_4O_TURBO = ("gpt-4o-turbo", None) 17 | CLAUDE_3_5_SONNET = ("claude-3-5-sonnet", ["latest", "20241022"]) 18 | CLAUDE_3_OPUS = ("claude-3-opus", ["latest", "20240229"]) 19 | 20 | def __init__(self, model_name: str, supported_releases: Optional[List[str]]): 21 | self.model_name = model_name 22 | self.supported_releases = supported_releases 23 | 24 | @classmethod 25 | def is_supported(cls, model_name: str) -> bool: 26 | # Attempt to match model_name by checking if it starts with a known model name 27 | for model in cls: 28 | if model_name.startswith(model.model_name): 29 | # Extract the release version if available 30 | release = model_name[len(model.model_name) :].lstrip("-") or None 31 | # Check if the model supports this release 32 | if model.supported_releases is None: 33 | return True 34 | return release in model.supported_releases if release else False 35 | return False 36 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/parser/llama.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import IO, List 3 | 4 | from llama_index.core.schema import Document as LlamaDocument 5 | from llama_parse import LlamaParse as _LlamaParse 6 | from llama_parse.utils import Language, ResultType 7 | from megaparse_sdk.schema.document import BBOX, 
Point2D, TextBlock 8 | from megaparse_sdk.schema.document import Document as MPDocument 9 | from megaparse_sdk.schema.extensions import FileExtension 10 | 11 | from megaparse.parser import BaseParser 12 | 13 | 14 | class LlamaParser(BaseParser): 15 | supported_extensions = [FileExtension.PDF] 16 | 17 | def __init__( 18 | self, 19 | api_key: str, 20 | verbose=True, 21 | language: Language = Language.FRENCH, 22 | parsing_instruction: str | None = None, 23 | **kwargs, 24 | ) -> None: 25 | self.api_key = api_key 26 | self.verbose = verbose 27 | self.language = language 28 | if parsing_instruction: 29 | self.parsing_instruction = parsing_instruction 30 | else: 31 | self.parsing_instruction = """Do not take into account the page breaks (no --- between pages), 32 | do not repeat the header and the footer so the tables are merged if needed. Keep the same format for similar tables.""" 33 | 34 | async def aconvert( 35 | self, 36 | file_path: str | Path | None = None, 37 | file: IO[bytes] | None = None, 38 | file_extension: None | FileExtension = None, 39 | **kwargs, 40 | ) -> MPDocument: 41 | if not file_path: 42 | raise ValueError("File_path should be provided to run LlamaParser") 43 | self.check_supported_extension(file_extension, file_path) 44 | 45 | llama_parser = _LlamaParse( 46 | api_key=self.api_key, 47 | result_type=ResultType.MD, 48 | gpt4o_mode=True, 49 | verbose=self.verbose, 50 | language=self.language, 51 | parsing_instruction=self.parsing_instruction, 52 | ) 53 | 54 | documents: List[LlamaDocument] = await llama_parser.aload_data(str(file_path)) 55 | 56 | return self.__to_elements_list__(documents) 57 | 58 | def convert( 59 | self, 60 | file_path: str | Path | None = None, 61 | file: IO[bytes] | None = None, 62 | file_extension: None | FileExtension = None, 63 | **kwargs, 64 | ) -> MPDocument: 65 | if not file_path: 66 | raise ValueError("File_path should be provided to run LlamaParser") 67 | self.check_supported_extension(file_extension, file_path) 68 | 69 | 
llama_parser = _LlamaParse( 70 | api_key=self.api_key, 71 | result_type=ResultType.JSON, 72 | gpt4o_mode=True, 73 | verbose=self.verbose, 74 | language=self.language, 75 | parsing_instruction=self.parsing_instruction, 76 | ) 77 | 78 | documents: List[LlamaDocument] = llama_parser.load_data(str(file_path)) 79 | 80 | return self.__to_elements_list__(documents) 81 | 82 | def __to_elements_list__(self, llama_doc: List[LlamaDocument]) -> MPDocument: 83 | list_blocks = [] 84 | for i, page in enumerate(llama_doc): 85 | list_blocks.append( 86 | TextBlock( 87 | text=page.text, 88 | metadata={}, 89 | page_range=(i, i + 1), 90 | bbox=BBOX( 91 | top_left=Point2D(x=0, y=0), bottom_right=Point2D(x=1, y=1) 92 | ), 93 | ) 94 | ) 95 | return MPDocument( 96 | metadata={}, 97 | detection_origin="llamaparse", 98 | content=list_blocks, 99 | ) 100 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/parser/megaparse_vision.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import base64 3 | import re 4 | from io import BytesIO 5 | from pathlib import Path 6 | from typing import IO, List 7 | 8 | from langchain_core.language_models.chat_models import BaseChatModel 9 | from langchain_core.messages import HumanMessage 10 | from megaparse_sdk.schema.document import BBOX, Block, Point2D, TextBlock 11 | from megaparse_sdk.schema.document import Document as MPDocument 12 | from megaparse_sdk.schema.extensions import FileExtension 13 | from pdf2image import convert_from_path 14 | 15 | from megaparse.parser import BaseParser 16 | from megaparse.parser.entity import SupportedModel, TagEnum 17 | 18 | # BASE_OCR_PROMPT = """ 19 | # Transcribe the content of this file into markdown. Be mindful of the formatting. 20 | # Add formatting if you think it is not clear. 21 | # Do not include page breaks and merge content of tables if it is continued in the next page. 
22 | # Add tags around what you identify as a table [TABLE], header - complete chain of characters that are repeated at each start of pages - [HEADER], table of content [TOC] in the format '[tag] ... [/tag]' 23 | # Return only the parsed content. 24 | # """ 25 | 26 | BASE_OCR_PROMPT = """ 27 | You are tasked with transcribing and formatting the content of a file into markdown. Your goal is to create a well-structured, readable markdown document that accurately represents the original content while adding appropriate formatting and tags. 28 | 29 | 30 | Follow these instructions to complete the task: 31 | 32 | 1. Carefully read through the entire file content. 33 | 34 | 2. Transcribe the content into markdown format, paying close attention to the existing formatting and structure. 35 | 36 | 3. If you encounter any unclear formatting in the original content, use your judgment to add appropriate markdown formatting to improve readability and structure. 37 | 38 | 4. For tables, headers, and table of contents, add the following tags: 39 | - Tables: Enclose the entire table in [TABLE] and [/TABLE] tags. Merge content of tables if it is continued in the next page. 40 | - Headers (complete chain of characters repeated at the start of each page): Enclose in [HEADER] and [/HEADER] tags inside the markdown file. 41 | - Table of contents: Enclose in [TOC] and [/TOC] tags 42 | 43 | 5. When transcribing tables: 44 | - If a table continues across multiple pages, merge the content into a single, cohesive table. 45 | - Use proper markdown table formatting with pipes (|) and hyphens (-) for table structure. 46 | 47 | 6. Do not include page breaks in your transcription. 48 | 49 | 7. Maintain the logical flow and structure of the document, ensuring that sections and subsections are properly formatted using markdown headers (# for main headers, ## for subheaders, etc.). 50 | 51 | 8. 
Use appropriate markdown syntax for other formatting elements such as bold, italic, lists, and code blocks as needed. 52 | 53 | 10. Return only the parsed content in markdown format, including the specified tags for tables, headers, and table of contents. 54 | """ 55 | 56 | 57 | class MegaParseVision(BaseParser): 58 | supported_extensions = [FileExtension.PDF] 59 | 60 | def __init__(self, model: BaseChatModel, **kwargs): 61 | if hasattr(model, "model_name"): 62 | if not SupportedModel.is_supported(model.model_name): 63 | raise ValueError( 64 | f"Invald model name, MegaParse vision only supports model that have vision capabilities. " 65 | f"{model.model_name} is not supported." 66 | ) 67 | self.model = model 68 | 69 | self.parsed_chunks: list[str] | None = None 70 | 71 | def process_file(self, file_path: str, image_format: str = "PNG") -> List[str]: 72 | """ 73 | Process a PDF file and convert its pages to base64 encoded images. 74 | 75 | :param file_path: Path to the PDF file 76 | :param image_format: Format to save the images (default: PNG) 77 | :return: List of base64 encoded images 78 | """ 79 | try: 80 | images = convert_from_path(file_path) 81 | images_base64 = [] 82 | for image in images: 83 | buffered = BytesIO() 84 | image.save(buffered, format=image_format) 85 | image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") 86 | images_base64.append(image_base64) 87 | return images_base64 88 | except Exception as e: 89 | raise ValueError(f"Error processing PDF file: {str(e)}") 90 | 91 | def get_element(self, tag: TagEnum, chunk: str): 92 | pattern = rf"\[{tag.value}\]([\s\S]*?)\[/{tag.value}\]" 93 | all_elmts = re.findall(pattern, chunk) 94 | if not all_elmts: 95 | print(f"No {tag.value} found in the chunk") 96 | return [] 97 | return [elmt.strip() for elmt in all_elmts] 98 | 99 | async def asend_to_mlm(self, images_data: List[str]) -> str: 100 | """ 101 | Send images to the language model for processing. 
102 | 103 | :param images_data: List of base64 encoded images 104 | :return: Processed content as a string 105 | """ 106 | images_prompt = [ 107 | { 108 | "type": "image_url", 109 | "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}, 110 | } 111 | for image_data in images_data 112 | ] 113 | message = HumanMessage( 114 | content=[ 115 | {"type": "text", "text": BASE_OCR_PROMPT}, 116 | *images_prompt, 117 | ], 118 | ) 119 | response = await self.model.ainvoke([message]) 120 | return str(response.content) 121 | 122 | def send_to_mlm(self, images_data: List[str]) -> str: 123 | """ 124 | Send images to the language model for processing. 125 | 126 | :param images_data: List of base64 encoded images 127 | :return: Processed content as a string 128 | """ 129 | images_prompt = [ 130 | { 131 | "type": "image_url", 132 | "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}, 133 | } 134 | for image_data in images_data 135 | ] 136 | message = HumanMessage( 137 | content=[ 138 | {"type": "text", "text": BASE_OCR_PROMPT}, 139 | *images_prompt, 140 | ], 141 | ) 142 | response = self.model.invoke([message]) 143 | return str(response.content) 144 | 145 | async def aconvert( 146 | self, 147 | file_path: str | Path | None = None, 148 | file: IO[bytes] | None = None, 149 | file_extension: FileExtension | None = None, 150 | batch_size: int = 3, 151 | **kwargs, 152 | ) -> MPDocument: 153 | """ 154 | Parse a PDF file and process its content using the language model. 
155 | 156 | :param file_path: Path to the PDF file 157 | :param batch_size: Number of pages to process concurrently 158 | :return: List of processed content strings 159 | """ 160 | if not file_path: 161 | raise ValueError("File_path should be provided to run MegaParseVision") 162 | 163 | if isinstance(file_path, Path): 164 | file_path = str(file_path) 165 | 166 | self.check_supported_extension(file_extension, file_path) 167 | 168 | pdf_base64 = self.process_file(file_path) 169 | n_pages = len(pdf_base64) 170 | tasks = [ 171 | self.asend_to_mlm(pdf_base64[i : i + batch_size]) 172 | for i in range(0, len(pdf_base64), batch_size) 173 | ] 174 | self.parsed_chunks = await asyncio.gather(*tasks) 175 | responses = self.get_cleaned_content("\n".join(self.parsed_chunks)) 176 | return self.__to_elements_list__(responses, n_pages=n_pages) 177 | 178 | def convert( 179 | self, 180 | file_path: str | Path | None = None, 181 | file: IO[bytes] | None = None, 182 | file_extension: FileExtension | None = None, 183 | batch_size: int = 3, 184 | **kwargs, 185 | ) -> MPDocument: 186 | """ 187 | Parse a PDF file and process its content using the language model. 
188 | 189 | :param file_path: Path to the PDF file 190 | :param batch_size: Number of pages to process at a time 191 | :return: List of processed content strings 192 | """ 193 | if not file_path: 194 | raise ValueError("File_path should be provided to run MegaParseVision") 195 | 196 | if isinstance(file_path, Path): 197 | file_path = str(file_path) 198 | 199 | self.check_supported_extension(file_extension, file_path) 200 | 201 | pdf_base64 = self.process_file(file_path) 202 | n_pages = len(pdf_base64) 203 | chunks = [ 204 | pdf_base64[i : i + batch_size] 205 | for i in range(0, len(pdf_base64), batch_size) 206 | ] 207 | self.parsed_chunks = [] 208 | for chunk in chunks: 209 | response = self.send_to_mlm(chunk) 210 | self.parsed_chunks.append(response) 211 | responses = self.get_cleaned_content("\n".join(self.parsed_chunks)) 212 | return self.__to_elements_list__(responses, n_pages) 213 | 214 | def get_cleaned_content(self, parsed_file: str) -> str: 215 | """ 216 | Get cleaned parsed file without any tags defined in TagEnum. 217 | 218 | This method removes all tags from TagEnum from the parsed file, formats the content, 219 | and handles the HEADER tag specially by keeping only the first occurrence. 220 | 221 | Args: 222 | parsed_file (str): The parsed file content with tags. 223 | 224 | Returns: 225 | str: The cleaned content without TagEnum tags. 
    def __to_elements_list__(self, mpv_doc: str, n_pages: int) -> MPDocument:
        """
        Wrap the cleaned markdown output in a single-block MPDocument.

        The whole parsed text becomes one TextBlock spanning pages
        ``(0, n_pages - 1)`` with a full-page bounding box, since the vision
        pipeline does not preserve per-page boundaries.

        Args:
            mpv_doc: Cleaned markdown content for the whole document.
            n_pages: Number of pages in the source PDF.

        Returns:
            MPDocument tagged with detection_origin "megaparse_vision".
        """
        list_blocks: List[Block] = [
            TextBlock(
                text=mpv_doc,
                metadata={},
                page_range=(0, n_pages - 1),
                bbox=BBOX(top_left=Point2D(x=0, y=0), bottom_right=Point2D(x=1, y=1)),
            )
        ]
        return MPDocument(
            metadata={},
            detection_origin="megaparse_vision",
            content=list_blocks,
        )
def extract_layout(
    page_number: int,
    page_image: Image.Image,
    model_name: str = "yolox",
    display: bool = True,
) -> PageLayout:
    """Run layout detection on a single rendered page image.

    Args:
        page_number: Page index forwarded to the resulting ``PageLayout``.
        page_image: The page rendered as a PIL image.
        model_name: Name of the unstructured-inference detection model to load.
        display: When True (default, preserving historical behaviour), draw
            every detected element's bounding box in red and open the result
            in the system image viewer. Pass False for headless/library use —
            the unconditional viewer pop-up was a debug side effect.

    Returns:
        PageLayout: The detected layout elements for the page.
    """
    layout_model = get_model(model_name)
    parsed_page = PageLayout.from_image(
        image=page_image,
        number=page_number,
        detection_model=layout_model,
        element_extraction_model=None,
        fixed_layout=None,
    )

    if display:
        # Debug visualisation: overlay each detected element in red.
        for element in parsed_page.elements:
            page_image = draw_bbox(page_image, element, color="red", details=False)
        # NOTE: blocking/side-effectful — launches the OS image viewer.
        page_image.show()

    return parsed_page
["CoreMLExecutionProvider"] 25 | elif device == DeviceEnum.CPU: 26 | return ["CPUExecutionProvider"] 27 | else: 28 | raise ValueError("device not in (CUDA,CoreML,CPU)") 29 | -------------------------------------------------------------------------------- /libs/megaparse/src/megaparse/utils/strategy.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy as np 4 | from megaparse.models.page import Page 5 | from megaparse_sdk.schema.document import TextDetection 6 | from megaparse_sdk.schema.parser_config import StrategyEnum 7 | from pypdfium2._helpers.page import PdfPage 8 | 9 | 10 | def get_page_strategy( 11 | pdfium_page: PdfPage, onnxtr_page: TextDetection | None, threshold: float 12 | ) -> StrategyEnum: 13 | if onnxtr_page is None: 14 | return StrategyEnum.FAST 15 | text_coords = [] 16 | # Get all the images in the page 17 | for obj in pdfium_page.get_objects(): 18 | if obj.type == 1: # type: ignore 19 | text_coords.append(obj.get_pos()) 20 | 21 | p_width, p_height = int(pdfium_page.get_width()), int(pdfium_page.get_height()) 22 | 23 | pdfium_canva = np.zeros((int(p_height), int(p_width))) 24 | 25 | for coords in text_coords: 26 | # (left,bottom,right, top) 27 | # 0---l--------------R-> y 28 | # | 29 | # B (x0,y0) 30 | # | 31 | # T (x1,y1) 32 | # ^ 33 | # x 34 | x0, y0, x1, y1 = ( 35 | p_height - coords[3], 36 | coords[0], 37 | p_height - coords[1], 38 | coords[2], 39 | ) 40 | x0 = max(0, min(p_height, int(x0))) 41 | y0 = max(0, min(p_width, int(y0))) 42 | x1 = max(0, min(p_height, int(x1))) 43 | y1 = max(0, min(p_width, int(y1))) 44 | pdfium_canva[x0:x1, y0:y1] = 1 45 | 46 | onnxtr_canva = np.zeros((int(p_height), int(p_width))) 47 | for block in onnxtr_page.bboxes: 48 | x0, y0 = block.bbox[0] 49 | x1, y1 = block.bbox[1] 50 | x0 = max(0, min(int(x0 * p_width), int(p_width))) 51 | y0 = max(0, min(int(y0 * p_height), int(p_height))) 52 | x1 = max(0, min(int(x1 * p_width), int(p_width))) 
53 | y1 = max(0, min(int(y1 * p_height), int(p_height))) 54 | onnxtr_canva[y0:y1, x0:x1] = 1 55 | 56 | intersection = np.logical_and(pdfium_canva, onnxtr_canva) 57 | union = np.logical_or(pdfium_canva, onnxtr_canva) 58 | sum_intersection = np.sum(intersection) 59 | sum_union = np.sum(union) 60 | iou = sum_intersection / sum_union if sum_union != 0 else 0 61 | if iou < threshold: 62 | return StrategyEnum.HI_RES 63 | return StrategyEnum.FAST 64 | 65 | 66 | def determine_global_strategy(pages: List[Page], threshold: float) -> StrategyEnum: 67 | count = sum(1 for page in pages if page.strategy == StrategyEnum.HI_RES) 68 | if count / len(pages) > threshold: 69 | return StrategyEnum.HI_RES 70 | return StrategyEnum.FAST 71 | -------------------------------------------------------------------------------- /libs/megaparse/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/__init__.py -------------------------------------------------------------------------------- /libs/megaparse/tests/certs/client-cert.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIEqDCCAxCgAwIBAgIRAITvq6ZEk6paYFDRbueJhEMwDQYJKoZIhvcNAQELBQAw 3 | gZ0xHjAcBgNVBAoTFW1rY2VydCBkZXZlbG9wbWVudCBDQTE5MDcGA1UECwwwYW1p 4 | bmVAYW1pbmVzLU1hY0Jvb2stUHJvLmxvY2FsIChhbWluZSBkaXJob3Vzc2kpMUAw 5 | PgYDVQQDDDdta2NlcnQgYW1pbmVAYW1pbmVzLU1hY0Jvb2stUHJvLmxvY2FsIChh 6 | bWluZSBkaXJob3Vzc2kpMB4XDTI0MTExOTEwNDgwN1oXDTI3MDIxOTEwNDgwN1ow 7 | ZDEnMCUGA1UEChMebWtjZXJ0IGRldmVsb3BtZW50IGNlcnRpZmljYXRlMTkwNwYD 8 | VQQLDDBhbWluZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFtaW5lIGRpcmhv 9 | dXNzaSkwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQC2fDlGlKYIj8bp 10 | tlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5 11 | KDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH 12 | 
qmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN 13 | gLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8 14 | ghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT 15 | WWVVcNfJAgMBAAGjgZowgZcwDgYDVR0PAQH/BAQDAgWgMCcGA1UdJQQgMB4GCCsG 16 | AQUFBwMCBggrBgEFBQcDAQYIKwYBBQUHAwQwHwYDVR0jBBgwFoAUV2w3gvQM5La1 17 | 2fk80tJXoM/14l4wOwYDVR0RBDQwMoIJbG9jYWxob3N0gRNtZWdhcGFyc2VAcXVp 18 | dnIuYXBwhxAAAAAAAAAAAAAAAAAAAAABMA0GCSqGSIb3DQEBCwUAA4IBgQAYq4VZ 19 | 6spwGvcqg8kCOghu6o54UPYo/NLzh3oYewJnDJ+2XD786TpTgjZMGA6Ms+det6oV 20 | HdT5s77VFgJiJloHlD0fpKkRxjzyBOk5/bQcCKkTMBVfgJbMoAfa2gq+/7zxmLcn 21 | AmNg7BkmsTtHWPsLyN3rYI4dkkDKWkxp8Sezm9WPEa9OGJDJSYf4Dq9pN1lUoP1p 22 | vxsq7sW0HDWnx/I2zWuz3AaT9b4UayRnk4IRYxAuYYN/k0GNjVmmDveywNoNlkmW 23 | 0Az6ycPN+vvz8Jpm3CbZSIQLO8Yn57H/aU4DmOtunm3VLUiLucmfOggv8Sq5n2g9 24 | ze61UJu9lr2/nWOXnErl3V9UL3kJ1OlbFzTWDGm9zX7boo6MLXy+fAj+Tw0sCeMr 25 | drdxo8IUYYU6HUdtuLGMFznBFFUNhfFSwFANGPB38NyofwLPSZM0hYntQqBMt/P7 26 | /E+wQ67hSEutkIbOD3kGkGREIk3dVyUeajO9DFTaQ+yTnNtnuUbxs5LkRlw= 27 | -----END CERTIFICATE----- 28 | -------------------------------------------------------------------------------- /libs/megaparse/tests/certs/client-key.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN PRIVATE KEY----- 2 | MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQC2fDlGlKYIj8bp 3 | tlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5 4 | KDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH 5 | qmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN 6 | gLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8 7 | ghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT 8 | WWVVcNfJAgMBAAECggEBAIK2AlSzHyacze8UH16qDTzibGVRGjxkf895Rnqi6COU 9 | QYD3PQrsVYCS/sMbHiujHV7FZC+rRcmufaBTVl7bH10yGIQc28iZ2YtbsppTEkTj 10 | rGUynTtXJPNHZ2vJOs1I9LXdk7maogPN2zzraIQP7AgTGCSOclIi3fpfRmfKwUOj 11 | 
class FakeParserBuilder:
    """Test double for the API's parser-builder dependency.

    ``build`` ignores its arguments and returns a stub parser whose sync and
    async conversion methods print a marker line and return a canned
    single-block document, so endpoint tests never run real parsing.
    """

    def build(self, *args, **kwargs) -> BaseParser:
        """
        Build a fake parser, ignoring the given configuration.

        Returns:
            BaseParser: The built fake parser.
        """

        class FakeParser(BaseParser):
            # Synchronous conversion: ignores every input, returns a fixed document.
            def convert(
                self,
                file_path: str | Path | None = None,
                file: IO[bytes] | None = None,
                file_extension: None | FileExtension = None,
                **kwargs,
            ) -> MPDocument:
                print("Fake parser is converting the file")
                return MPDocument(
                    file_name="Fake file",
                    content=[TextBlock(text="Fake conversion result", metadata={})],
                    metadata={},
                    detection_origin="fakeparser",
                )

            # Async variant: same canned result as ``convert``.
            async def aconvert(
                self,
                file_path: str | Path | None = None,
                file: IO[bytes] | None = None,
                file_extension: None | FileExtension = None,
                **kwargs,
            ) -> MPDocument:
                print("Fake parser is converting the file")
                return MPDocument(
                    file_name="Fake file",
                    content=[TextBlock(text="Fake conversion result", metadata={})],
                    metadata={},
                    detection_origin="fakeparser",
                )

        return FakeParser()


@pytest_asyncio.fixture(scope="function")
async def test_client():
    """Yield an httpx ``AsyncClient`` bound to the app with faked dependencies.

    Overrides the parser builder and the Playwright URL loader so no real
    parsing or browser work happens, then clears the overrides on teardown.
    """
    print("Setting up test_client fixture")

    def fake_parser_builder():
        return FakeParserBuilder()

    def fake_playwright_loader():
        # Loader stub: returns fixed page content instead of driving a browser.
        class FakePlaywrightLoader(PlaywrightURLLoader):
            async def aload(self):
                return [Document(page_content="Fake website content")]

        return FakePlaywrightLoader(urls=[], remove_selectors=["header", "footer"])

    app.dependency_overrides[parser_builder_dep] = fake_parser_builder
    app.dependency_overrides[get_playwright_loader] = fake_playwright_loader
    # ASGITransport routes requests to the app in-process — no network involved.
    async with AsyncClient(
        transport=ASGITransport(app=app),  # type: ignore
        base_url="http://test",
    ) as ac:
        yield ac
    # Teardown: restore the real dependencies for subsequent tests.
    app.dependency_overrides = {}
-------------------------------------------------------------------------------- /libs/megaparse/tests/data/MegaFake_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/data/MegaFake_report.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/data/dummy.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/data/dummy.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/data/grt_example/MegaFake_report.md: -------------------------------------------------------------------------------- 1 | | My Mega fake report | #1756394 | 31/05/2024 | 2 | |---------------------|----------|------------| 3 | 4 | ## Why Mega Parse might be the best ? 5 | 6 | ### Introduction 7 | 8 | Mega Parse is a state-of-the-art document parser designed to convert various document formats such as PDF, DOCX, PPTX, and more into Markdown (MD) format, making them ready for Retrieval-Augmented Generation (RAG) ingestion. In today's data-driven world, the ability to efficiently manage and utilize large volumes of information is crucial. This report explores the features, benefits, and comparative performance of Mega Parse, illustrating why it stands out as a superior tool in the realm of document parsing. 9 | 10 | ### Features of Mega Parse 11 | 12 | Mega Parse boasts an impressive array of features tailored to meet the diverse needs of modern enterprises. 13 | 14 | **Multiple Format Support:** Mega Parse supports a wide range of document formats including PDF, DOCX, and PPTX. This versatility allows users to handle various document types without needing multiple tools. 
Whether you are working with text documents, presentations, or scanned PDFs, Mega Parse has you covered. 15 | 16 | **High-Speed Processing:** One of the standout features of Mega Parse is its ability to convert documents at a rapid pace. With processing speeds of up to 120 pages per minute, it significantly enhances productivity by reducing the time spent on document conversion. 17 | 18 | **Markdown Output:** Mega Parse converts documents into a structured Markdown format. Markdown is a lightweight markup language with plain text formatting syntax, which is widely used because of its simplicity and ease of conversion to other formats. This makes it ideal for RAG ingestion, where structured and easily interpretable data is paramount. 19 | 20 | Accuracy: Accuracy in text extraction and formatting is a critical aspect of any document parser. Mega Parse ensures high accuracy, maintaining the integrity and structure of the original documents. This is particularly important for documents that contain complex formatting and embedded elements. 21 | 22 | Customizable Parsing Rules: Users can define custom parsing rules to suit specific needs, allowing for greater control over the conversion process. This flexibility ensures that Mega Parse can be adapted to a wide variety of use cases. 23 | 24 | Batch Processing: Mega Parse supports batch processing, enabling the simultaneous conversion of multiple documents. This feature is particularly useful for organizations dealing with large volumes of documents, as it streamlines the workflow and saves time. 25 | 26 | Error Handling: Advanced error handling capabilities ensure that any issues encountered during the conversion process are managed effectively, minimizing disruptions and maintaining workflow efficiency. 27 | 28 | # Benefits of Mega Parse 29 | 30 | The implementation of Mega Parse offers numerous benefits that can transform the way organizations manage their documents. 
31 | 32 | **Efficiency:** By significantly speeding up the document conversion process, Mega Parse increases overall efficiency. This is especially beneficial for industries that handle large volumes of documents on a daily basis, such as legal firms, financial institutions, and research organizations. 33 | 34 | **Versatility:** Mega Parse's ability to handle multiple document types makes it a versatile tool for various industries. Whether you need to convert legal documents, technical manuals, or business presentations, Mega Parse is equipped to handle the task. 35 | 36 | **Enhanced Knowledge Management:** Converting documents to Markdown facilitates easier content management and retrieval. Markdown files are not only lightweight but 37 | also highly compatible with various knowledge management systems, making it easier to organize, search, and utilize information. 38 | 39 | Improved Workflow: Mega Parse simplifies the process of preparing documents for machine learning and AI applications. By converting documents into a structured format, it reduces the time and effort required to preprocess data, allowing teams to focus on higher-level tasks. 40 | 41 | Cost Savings: The efficiency and speed of Mega Parse can lead to significant cost savings. Reduced processing times and improved workflow efficiency mean that resources can be allocated more effectively, ultimately lowering operational costs. 42 | 43 | Scalability: Mega Parse is designed to scale with the needs of an organization. As document volumes grow, Mega Parse can handle the increased load without compromising performance, making it a future-proof solution for document management. 44 | 45 | # Comparative Performance 46 | 47 | The following table provides a comprehensive comparative analysis of Mega Parse against other document parsers based on fictional performance metrics. This comparison highlights the strengths of Mega Parse in various key areas. 
48 | 49 | | Metric | Mega Parse | Parser A | Parser B | Parser C | Parser D | 50 | |---------------------|------------------|----------------|----------------|----------------|----------------| 51 | | Supported Formats | PDF, DOCX, PPTX | PDF, DOCX | DOCX, PPTX | PDF, PPTX | PDF, DOCX, XLSX| 52 | | Conversion Speed (pages/min) | 120 | 90 | 100 | 85 | 95 | 53 | | **Accuracy Rate (%)** | 98 | 95 | 93 | 90 | 92 | 54 | | **Output Format** | Markdown | HTML | Markdown | Plain Text | HTML | 55 | | **Error Rate (%)** | 1 | 3 | 4 | 5 | 3 | 56 | | **Ease of Use** | High | Medium | High | Medium | Medium | 57 | | **Integration Capability** | Excellent | Good | Good | Fair | Good | 58 | | **Batch Processing** | Yes | No | Yes | No | Yes | 59 | | **Custom Parsing Rules** | Yes | Limited | Yes | No | Limited | 60 | | **Multilingual Support** | Yes | Yes | No | Yes | Yes | 61 | | **OCR (Optical Character Recognition)** | Yes | No | Yes | No | Yes | 62 | | **Price (per user/month)** | $30 | $25 | $20 | $15 | $18 | 63 | | **Customer Support Rating (out of 5)** | 4.8 | 4.2 | 4.5 | 3.9 | 4.1 | 64 | | **Free Trial Available** | Yes | Yes | No | Yes | No | 65 | | **Cloud Integration** | Yes | No | Yes | Yes | No | 66 | | **Security Features** | Advanced | Basic | Advanced | Basic | Intermediate | 67 | | **User Community Size** | Large | Medium | Medium | Small | Medium | 68 | | **Monthly Updates** | Yes | Yes | No | Yes | No | 69 | | **Mobile App Availability** | Yes | No | Yes | No | Yes | 70 | | **Platform Compatibility** | Windows, Mac, Linux | Windows, Mac | Windows | Mac, Linux | Windows, Linux | 71 | | **Data Privacy Compliance** | High | Medium | High | Low | Medium | 72 | | **AI-Driven Enhancements** | Yes | No | Yes | No | Yes | 73 | | **File Size Limit (per document)** | 1GB | 500MB | 750MB | 200MB | 500MB | 74 | | **User Training Resources** | Extensive | Moderate | Extensive | Limited | Moderate | 75 | | **API Access** | Yes | No | Yes | No | Yes | 76 | | **Customizable 
Output Templates** | Yes | Limited | Yes | No | Yes | 77 | | **Collaboration Features** | Yes | No | Yes | No | Limited | 78 | | **Document Version Control** | Yes | No | Yes | No | Yes | 79 | | **Import/Export Options** | Extensive | Moderate | Extensive | Limited | Moderate | 80 | | Feedback Mechanism | Yes | No | Yes | No | Yes | 81 | 82 | *Note: All data presented in this table is fictional and for illustrative purposes only.* 83 | 84 | ## Conclusion 85 | 86 | Mega Parse stands out as a leading document parser due to its extensive format support, high-speed processing, and accuracy. Its ability to convert a variety of document types into Markdown format makes it an invaluable tool for organizations looking to streamline their document management processes and enhance their knowledge management systems. With features like customizable parsing rules, batch processing, and advanced error handling, Mega Parse is well-equipped to meet the demands of modern enterprises. Its scalability and cost-effectiveness further reinforce its position as a top choice for document parsing and conversion needs. By leveraging Mega Parse, organizations can improve their workflow efficiency, reduce operational costs, and better manage their information assets in the age of big data and artificial intelligence. 
-------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/mlbook.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/mlbook.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/native/0168011.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/native/0168011.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/native/0168014.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/native/0168014.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/native/0168029.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/native/0168029.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/ocr/0168003.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/ocr/0168003.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/ocr/0168004.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/ocr/0168004.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/ocr/0168119.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/ocr/0168119.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/ocr/0168120.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/ocr/0168120.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/ocr/0168123.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/ocr/0168123.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/ocr/0168126.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/ocr/0168126.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/ocr/0168127.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/ocr/0168127.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/ocr/0168322.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/ocr/0168322.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/rust.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/rust.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/sample_native.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/sample_native.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/sample_pdf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/sample_pdf.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/sample_table.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/pdf/sample_table.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/pdf/test_detect_ocr.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pypdfium2 4 | import pytest 5 | from megaparse.megaparse import MegaParse 6 | from megaparse.utils.strategy import determine_global_strategy 7 | from megaparse_sdk.schema.parser_config import 
# Test corpora: scanned PDFs that should need OCR, and natively digital PDFs.
# Paths are relative to the package root, so run the suite from libs/megaparse.
ocr_pdfs = os.listdir("./tests/pdf/ocr")
native_pdfs = os.listdir("./tests/pdf/native")

# Single shared instance, created once at import time and reused by every
# parametrized case below.
megaparse = MegaParse()


@pytest.mark.parametrize("hi_res_pdf", ocr_pdfs)
def test_hi_res_strategy(hi_res_pdf):
    """Every scanned/OCR PDF in the corpus should resolve to HI_RES."""
    if hi_res_pdf == "0168004.pdf":
        pytest.skip("Skip 0168004.pdf as it is flaky currently")

    pdf_doc = pypdfium2.PdfDocument(f"./tests/pdf/ocr/{hi_res_pdf}")
    pages = megaparse.extract_page_strategies(pdf_doc)
    assert (
        determine_global_strategy(
            pages, megaparse.config.auto_config.document_threshold
        )
        == StrategyEnum.HI_RES
    )


@pytest.mark.parametrize("native_pdf", native_pdfs)
def test_fast_strategy(native_pdf):
    """Every natively digital PDF in the corpus should resolve to FAST."""
    if native_pdf == "0168029.pdf":
        pytest.skip("Skip 0168029.pdf as it is too long to process")

    pdf_doc = pypdfium2.PdfDocument(f"./tests/pdf/native/{native_pdf}")
    pages = megaparse.extract_page_strategies(pdf_doc)

    assert (
        determine_global_strategy(
            pages, megaparse.config.auto_config.document_threshold
        )
        == StrategyEnum.FAST
    )
# def test_get_default_processors_megaparse():
#     megaparse = MegaParse()
#     assert type(megaparse.parser) is UnstructuredParser


@pytest.mark.asyncio
@pytest.mark.parametrize("pdf_name", ["scanned_pdf", "native_pdf"])
async def test_async_megaparse_pdf_processor_file_path(pdf_name, request):
    """Async load by file path produces non-empty output for both PDF kinds."""
    pdf = request.getfixturevalue(pdf_name)
    # NOTE(review): CoreML is macOS-only — confirm this test is intended to
    # run exclusively on Apple hardware; it will fail on Linux CI otherwise.
    processor = MegaParse(config=MegaParseConfig(device=DeviceEnum.COREML))
    result = await processor.aload(file_path=pdf)
    assert len(str(result)) > 0


@pytest.mark.parametrize("pdf_name", ["scanned_pdf", "native_pdf"])
def test_sync_megaparse_pdf_processor_file_path(pdf_name, request):
    """Sync load by file path produces non-empty output for both PDF kinds."""
    pdf = request.getfixturevalue(pdf_name)
    processor = MegaParse()
    result = processor.load(file_path=pdf)
    assert len(result) > 0


@pytest.mark.asyncio
@pytest.mark.parametrize("pdf_name", ["scanned_pdf", "native_pdf"])
async def test_megaparse_pdf_processor_file(pdf_name, request):
    """Async load from an open binary file object (extension given explicitly)."""
    pdf = request.getfixturevalue(pdf_name)
    processor = MegaParse()
    with open(pdf, "rb") as f:
        result = await processor.aload(file=f, file_extension=FileExtension.PDF)
    assert len(str(result)) > 0


def test_strategy_native(native_pdf):
    """A digital-text PDF should be classified FAST by the strategy pass."""
    processor = MegaParse()
    pdf_doc = pypdfium2.PdfDocument(native_pdf)

    pages = processor.extract_page_strategies(pdf_doc)

    assert (
        determine_global_strategy(
            pages, processor.config.auto_config.document_threshold
        )
        == StrategyEnum.FAST
    )
    pdf_doc.close()


def test_strategy_scanned(scanned_pdf):
    """A scanned PDF should be classified HI_RES by the strategy pass."""
    processor = MegaParse()
    pdf_doc = pypdfium2.PdfDocument(scanned_pdf)
    pages = processor.extract_page_strategies(pdf_doc)
    assert (
        determine_global_strategy(
            pages, processor.config.auto_config.document_threshold
        )
        == StrategyEnum.HI_RES
    )
    pdf_doc.close()
def test_pdfium():
    """Smoke test: pypdfium2 opens a scanned PDF and can walk every page object."""
    # scanned pdf
    pdf_path = Path("./tests/pdf/mlbook.pdf")
    document = pdfium.PdfDocument(pdf_path)

    collected = [obj for page in document for obj in page.get_objects()]

    document.close()
https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/supported_docs/file_example_XLS_50.xls -------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample.csv: -------------------------------------------------------------------------------- 1 | Name,Description 2 | MegaParse,"MegaParse is the best parser, even with accents like é, è, and ñ." 3 | OtherParse,"OtherParse is a decent parser, but it struggles with accents." 4 | RandomParse,"RandomParse is another parser, but it often fails with special characters." -------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/supported_docs/sample.docx -------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample.markdown: -------------------------------------------------------------------------------- 1 | # The Difficulty of Parsing Files 2 | 3 | Parsing files can be a challenging task due to several factors: 4 | 5 | ## 1. File Format Variability 6 | Different file formats (e.g., JSON, XML, CSV) require different parsing techniques. Each format has its own structure and rules, making it necessary to handle each one uniquely. 7 | 8 | ## 2. Inconsistent Data 9 | Files often contain inconsistent or malformed data. Handling these inconsistencies requires robust error-checking and validation mechanisms. 10 | 11 | ## 3. Large File Sizes 12 | Parsing large files can be resource-intensive and time-consuming. Efficient algorithms and memory management techniques are essential to handle large datasets. 13 | 14 | ## 4. 
Encoding Issues 15 | Files may use different character encodings (e.g., UTF-8, ASCII). Properly detecting and handling these encodings is crucial to avoid data corruption. 16 | 17 | ## 5. Nested Structures 18 | Some file formats, like JSON and XML, can have deeply nested structures. Parsing these nested structures requires recursive algorithms and careful handling of hierarchical data. 19 | 20 | ## Conclusion 21 | Despite these challenges, effective file parsing is essential for data processing and analysis. By understanding and addressing these difficulties, developers can create robust parsers that handle a wide variety of file formats and data inconsistencies. 22 | -------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample.md: -------------------------------------------------------------------------------- 1 | # The Difficulty of Parsing Files 2 | 3 | Parsing files can be a challenging task due to several factors: 4 | 5 | ## 1. File Format Variability 6 | Different file formats (e.g., JSON, XML, CSV) require different parsing techniques. Each format has its own structure and rules, making it necessary to handle each one uniquely. 7 | 8 | ## 2. Inconsistent Data 9 | Files often contain inconsistent or malformed data. Handling these inconsistencies requires robust error-checking and validation mechanisms. 10 | 11 | ## 3. Large File Sizes 12 | Parsing large files can be resource-intensive and time-consuming. Efficient algorithms and memory management techniques are essential to handle large datasets. 13 | 14 | ## 4. Encoding Issues 15 | Files may use different character encodings (e.g., UTF-8, ASCII). Properly detecting and handling these encodings is crucial to avoid data corruption. 16 | 17 | ## 5. Nested Structures 18 | Some file formats, like JSON and XML, can have deeply nested structures. Parsing these nested structures requires recursive algorithms and careful handling of hierarchical data. 
19 | 20 | ## Conclusion 21 | Despite these challenges, effective file parsing is essential for data processing and analysis. By understanding and addressing these difficulties, developers can create robust parsers that handle a wide variety of file formats and data inconsistencies. 22 | -------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/supported_docs/sample.otf -------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/supported_docs/sample.pptx -------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample.txt: -------------------------------------------------------------------------------- 1 | Lorem ipsum 2 | 3 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio. 4 | 5 | Vestibulum neque massa, scelerisque sit amet ligula eu, congue molestie mi. Praesent ut varius sem. Nullam at porttitor arcu, nec lacinia nisi. Ut ac dolor vitae odio interdum condimentum. Vivamus dapibus sodales ex, vitae malesuada ipsum cursus convallis. Maecenas sed egestas nulla, ac condimentum orci. Mauris diam felis, vulputate ac suscipit et, iaculis non est. Curabitur semper arcu ac ligula semper, nec luctus nisl blandit. Integer lacinia ante ac libero lobortis imperdiet. Nullam mollis convallis ipsum, ac accumsan nunc vehicula vitae. Nulla eget justo in felis tristique fringilla. Morbi sit amet tortor quis risus auctor condimentum. Morbi in ullamcorper elit. 
Nulla iaculis tellus sit amet mauris tempus fringilla. 6 | Maecenas mauris lectus, lobortis et purus mattis, blandit dictum tellus. 7 | Maecenas non lorem quis tellus placerat varius. 8 | Nulla facilisi. 9 | Aenean congue fringilla justo ut aliquam. 10 | Mauris id ex erat. Nunc vulputate neque vitae justo facilisis, non condimentum ante sagittis. 11 | Morbi viverra semper lorem nec molestie. 12 | Maecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate. 13 | https://github.com/QuivrHQ/MegaParse -------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Charter Group 5 |
6 | 100 Main 7 | Framingham 8 | MA 9 | 01701 10 |
11 |
12 | 720 Prospect 13 | Framingham 14 | MA 15 | 01701 16 |
17 |
18 | 120 Ridge 19 | MA 20 | 01760 21 |
22 |
23 |
-------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample_complexe.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/supported_docs/sample_complexe.html -------------------------------------------------------------------------------- /libs/megaparse/tests/supported_docs/sample_native.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse/tests/supported_docs/sample_native.pdf -------------------------------------------------------------------------------- /libs/megaparse/tests/test_endpoints.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.asyncio 5 | async def test_parse_file_endpoint(test_client): 6 | # Simulate a request to the parse endpoint 7 | with open("./tests/pdf/sample_pdf.pdf", "rb") as file: 8 | response = await test_client.post( 9 | "/v1/file", 10 | files={"file": ("test.pdf", file)}, 11 | data={ 12 | "method": "unstructured", 13 | "strategy": "auto", 14 | "language": "en", 15 | "check_table": False, 16 | }, 17 | ) 18 | assert response.status_code == 200 19 | assert response.json()["message"] == "File parsed successfully" 20 | 21 | 22 | @pytest.mark.asyncio 23 | async def test_parse_url_endpoint(test_client): 24 | response = await test_client.post("/v1/url?url=https://www.quivr.com") 25 | assert response.status_code == 200 26 | assert response.json() == { 27 | "message": "Website content parsed successfully", 28 | "result": "Fake website content", 29 | } 30 | -------------------------------------------------------------------------------- /libs/megaparse/tests/test_import.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | from megaparse import MegaParse 3 | 4 | 5 | @pytest.mark.skip("slow test") 6 | def test_load(): 7 | megaparse = MegaParse() 8 | response = megaparse.load("./tests/data/dummy.pdf") 9 | print(response) 10 | assert response.strip("\n") == "Dummy PDF download" 11 | -------------------------------------------------------------------------------- /libs/megaparse/tests/test_parsers.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | from megaparse.parser.doctr_parser import DoctrParser 5 | from megaparse.parser.llama import LlamaParser 6 | from megaparse.parser.megaparse_vision import MegaParseVision 7 | from megaparse.parser.unstructured_parser import UnstructuredParser 8 | from megaparse_sdk.schema.extensions import FileExtension 9 | 10 | PARSER_LIST = [ 11 | UnstructuredParser, 12 | # DoctrParser, 13 | ] 14 | 15 | 16 | @pytest.mark.parametrize("parser", PARSER_LIST) 17 | @pytest.mark.parametrize("extension", list(FileExtension)) 18 | def test_sync_parser(parser, extension): 19 | directory = "./tests/supported_docs" 20 | file_path = next( 21 | ( 22 | os.path.join(root, file) 23 | for root, _, files in os.walk(directory) 24 | for file in files 25 | if file.endswith(extension.value) 26 | ), 27 | None, 28 | ) 29 | if file_path is None: 30 | pytest.fail(f"No file with extension {extension.value} found in {directory}") 31 | 32 | myparser = parser() 33 | if extension in myparser.supported_extensions: 34 | response = myparser.convert(file_path) 35 | 36 | assert response 37 | assert len(str(response)) > 0 38 | else: 39 | with pytest.raises(ValueError): 40 | myparser.convert(file_path) 41 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | 
## [0.1.12](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.11...megaparse-sdk-v0.1.12) (2025-02-13) 4 | 5 | 6 | ### Features 7 | 8 | * add layout detection ([#228](https://github.com/QuivrHQ/MegaParse/issues/228)) ([77f7040](https://github.com/QuivrHQ/MegaParse/commit/77f7040c9c221a17effce089be7ec575cdd83468)) 9 | 10 | ## [0.1.11](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.10...megaparse-sdk-v0.1.11) (2025-02-11) 11 | 12 | 13 | ### Features 14 | 15 | * add_layout_detection ([#220](https://github.com/QuivrHQ/MegaParse/issues/220)) ([2d2d0b4](https://github.com/QuivrHQ/MegaParse/commit/2d2d0b42bba4c883db423568e932eda42edd60d7)) 16 | * Text detection in auto strategy ([#209](https://github.com/QuivrHQ/MegaParse/issues/209)) ([03c7ada](https://github.com/QuivrHQ/MegaParse/commit/03c7ada1dc245e13ef41ffd6fa3a8ed869269d37)) 17 | 18 | 19 | ### Bug Fixes 20 | 21 | * Add EngineConfig & StrategyHandler ([#211](https://github.com/QuivrHQ/MegaParse/issues/211)) ([2e1c6dd](https://github.com/QuivrHQ/MegaParse/commit/2e1c6ddd676227d1cbc4cff9771b20595259ba38)) 22 | * add parse tests for every supported extensions ([#198](https://github.com/QuivrHQ/MegaParse/issues/198)) ([9dff0de](https://github.com/QuivrHQ/MegaParse/commit/9dff0de0c1de848151fe9a6519b658f0924c1228)) 23 | * Strategy heuristic test & fix ([#203](https://github.com/QuivrHQ/MegaParse/issues/203)) ([7b7fb40](https://github.com/QuivrHQ/MegaParse/commit/7b7fb40cae4ed380a5f0ca0035a7bd2bcc9147c3)) 24 | 25 | ## [0.1.10](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.9...megaparse-sdk-v0.1.10) (2024-12-16) 26 | 27 | 28 | ### Bug Fixes 29 | 30 | * hatchling version ([#193](https://github.com/QuivrHQ/MegaParse/issues/193)) ([f6070a5](https://github.com/QuivrHQ/MegaParse/commit/f6070a5483a20eeb83751a2dcfc01b7f0fb14473)) 31 | 32 | ## [0.1.9](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.8...megaparse-sdk-v0.1.9) (2024-12-13) 33 | 34 | 35 | ### Features 36 | 
37 | * small fixes ([#181](https://github.com/QuivrHQ/MegaParse/issues/181)) ([004afe2](https://github.com/QuivrHQ/MegaParse/commit/004afe2f170570075bbebcd32dec5d15ddba4609)) 38 | 39 | ## [0.1.8](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.7...megaparse-sdk-v0.1.8) (2024-12-12) 40 | 41 | 42 | ### Features 43 | 44 | * custom auto ([#131](https://github.com/QuivrHQ/MegaParse/issues/131)) ([3cb5be4](https://github.com/QuivrHQ/MegaParse/commit/3cb5be4a8c8eeb6dd6e9b87d7bbca24491db4c29)) 45 | * faster ocr ([#180](https://github.com/QuivrHQ/MegaParse/issues/180)) ([5661cb2](https://github.com/QuivrHQ/MegaParse/commit/5661cb2d52d959cbca0f41339791129cd35d4036)) 46 | 47 | ## [0.1.7](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.6...megaparse-sdk-v0.1.7) (2024-11-25) 48 | 49 | 50 | ### Bug Fixes 51 | 52 | * Update README.md ([#154](https://github.com/QuivrHQ/MegaParse/issues/154)) ([a103393](https://github.com/QuivrHQ/MegaParse/commit/a1033938184e20c24b0e54ee0db088b28075fd14)) 53 | 54 | ## [0.1.6](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.5...megaparse-sdk-v0.1.6) (2024-11-25) 55 | 56 | 57 | ### Features 58 | 59 | * megaparse sdk tests ([#148](https://github.com/QuivrHQ/MegaParse/issues/148)) ([e030285](https://github.com/QuivrHQ/MegaParse/commit/e0302853fc2c1526b8e912bf3ef85b970a5b89bc)) 60 | 61 | ## [0.1.5](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.4...megaparse-sdk-v0.1.5) (2024-11-21) 62 | 63 | 64 | ### Features 65 | 66 | * refacto megaparse for service ([#132](https://github.com/QuivrHQ/MegaParse/issues/132)) ([ab9ad7f](https://github.com/QuivrHQ/MegaParse/commit/ab9ad7fb7db580a04a998d144dd2ba3407068334)) 67 | * release plz ([#134](https://github.com/QuivrHQ/MegaParse/issues/134)) ([d8a221e](https://github.com/QuivrHQ/MegaParse/commit/d8a221e23f6e15e969c1328f183da3582d0d7925)) 68 | -------------------------------------------------------------------------------- 
/libs/megaparse_sdk/README.md: -------------------------------------------------------------------------------- 1 | ## MegaParse SDK 2 | 3 | Welcome to the MegaParse SDK! This SDK allows you to easily interact with the MegaParse API to upload URLs and files for processing. 4 | 5 | ### Installation 6 | 7 | To install the MegaParse SDK, use pip: 8 | 9 | ```sh 10 | pip install megaparse-sdk 11 | ``` 12 | 13 | ### Usage 14 | 15 | Here is an example of how to use the MegaParse SDK: 16 | 17 | #### Uploading URLs 18 | 19 | ```python 20 | import asyncio 21 | import os 22 | 23 | from megaparse.sdk import MegaParseSDK 24 | 25 | async def upload_url(): 26 | api_key = str(os.getenv("MEGAPARSE_API_KEY")) 27 | megaparse = MegaParseSDK(api_key) 28 | 29 | url = "https://www.quivr.com" 30 | 31 | # Upload a URL 32 | url_response = await megaparse.url.upload(url) 33 | print(f"\n----- URL Response : {url} -----\n") 34 | print(url_response) 35 | 36 | await megaparse.close() 37 | 38 | if __name__ == "__main__": 39 | asyncio.run(upload_url()) 40 | ``` 41 | 42 | #### Uploading Files 43 | 44 | ```python 45 | import asyncio 46 | import os 47 | 48 | from megaparse.sdk import MegaParseSDK 49 | 50 | async def upload_file(): 51 | api_key = str(os.getenv("MEGAPARSE_API_KEY")) 52 | megaparse = MegaParseSDK(api_key) 53 | 54 | file_path = "your/file/path.pdf" 55 | # Upload a file 56 | response = await megaparse.file.upload( 57 | file_path=file_path, 58 | method="unstructured", # unstructured, llama_parser, megaparse_vision 59 | strategy="auto", 60 | ) 61 | print(f"\n----- File Response : {file_path} -----\n") 62 | print(response) 63 | 64 | await megaparse.close() 65 | 66 | if __name__ == "__main__": 67 | asyncio.run(upload_file()) 68 | ``` 69 | 70 | ### Features 71 | 72 | - **Upload URLs**: Easily upload URLs for processing. 73 | - **Upload Files**: Upload files with different processing methods and strategies. 74 | 75 | ### Getting Started 76 | 77 | 1. 
**Set up your API key**: Make sure to set the `MEGAPARSE_API_KEY` environment variable with your MegaParse API key. 78 | 2. **Run the example**: Use the provided example to see how to upload URLs and files. 79 | 80 | For more details, refer to the [usage example](#file:usage_example.py-context). 81 | 82 | We hope you find the MegaParse SDK useful for your projects! 83 | 84 | Enjoy, _Quivr Team_ ! 85 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/examples/usage_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | 4 | from megaparse.sdk.megaparse_sdk import MegaParseSDK 5 | 6 | 7 | async def main(): 8 | api_key = str(os.getenv("MEGAPARSE_API_KEY")) 9 | megaparse = MegaParseSDK(api_key) 10 | 11 | # url = "https://www.quivr.com" 12 | 13 | # # Upload a URL 14 | # url_response = await megaparse.url.upload(url) 15 | # print(f"\n----- URL Response : {url} -----\n") 16 | # print(url_response) 17 | 18 | # file_path = "megaparse/sdk/pdf/MegaFake_report.pdf" 19 | file_path = ( 20 | "megaparse/sdk/examples/only_pdfs/4 The Language of Medicine 2024.07.21.pdf" 21 | ) 22 | # Upload a file 23 | response = await megaparse.file.upload( 24 | file_path=file_path, 25 | method="unstructured", # type: ignore # unstructured, llama_parser, megaparse_vision 26 | strategy="auto", # type: ignore # fast, auto, hi_res 27 | ) 28 | print(f"\n----- File Response : {file_path} -----\n") 29 | print(response) 30 | await megaparse.close() 31 | 32 | 33 | if __name__ == "__main__": 34 | asyncio.run(main()) 35 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/megaparse_sdk/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .client import MegaParseClient 2 | from .endpoints.file_upload import FileUpload 3 | from .endpoints.url_upload import URLUpload 4 | 5 | 6 | class MegaParseSDK: 7 | def __init__(self, api_key: str | None = None, base_url: str | None = None): 8 | self.client = MegaParseClient(api_key, base_url) 9 | self.file = FileUpload(self.client) 10 | self.url = URLUpload(self.client) 11 | 12 | async def close(self): 13 | await self.client.close() 14 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/megaparse_sdk/client.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import enum 3 | import logging 4 | import os 5 | from io import BytesIO 6 | from pathlib import Path 7 | from types import TracebackType 8 | from typing import Any, Self 9 | 10 | import httpx 11 | import nats 12 | from nats.errors import NoRespondersError, TimeoutError 13 | 14 | from megaparse_sdk.config import ClientNATSConfig, MegaParseSDKConfig 15 | from megaparse_sdk.schema.document import Document 16 | from megaparse_sdk.schema.mp_exceptions import ( 17 | DownloadError, 18 | InternalServiceError, 19 | MemoryLimitExceeded, 20 | ModelNotSupported, 21 | ParsingException, 22 | ) 23 | from megaparse_sdk.schema.mp_inputs import ( 24 | FileInput, 25 | MPInput, 26 | ParseFileConfig, 27 | ParseFileInput, 28 | ParseUrlInput, 29 | ) 30 | from megaparse_sdk.schema.mp_outputs import ( 31 | MPErrorType, 32 | MPOutput, 33 | MPOutputType, 34 | ) 35 | from megaparse_sdk.utils.load_ssl import load_ssl_cxt 36 | 37 | logger = logging.getLogger("megparse_sdk") 38 | 39 | 40 | class MegaParseClient: 41 | def __init__( 42 | self, 43 | api_key: str | None = None, 44 | base_url: str | None = None, 45 | ): 46 | config = MegaParseSDKConfig() 47 | self.base_url = base_url or config.url 48 | self.api_key = api_key or config.api_key 49 | 
self.max_retries = config.max_retries 50 | if self.api_key: 51 | self.session = httpx.AsyncClient( 52 | headers={"x-api-key": self.api_key}, timeout=config.timeout 53 | ) 54 | else: 55 | self.session = httpx.AsyncClient(timeout=config.timeout) 56 | 57 | async def request(self, method: str, endpoint: str, **kwargs: Any) -> Any: 58 | url = f"{self.base_url}{endpoint}" 59 | client = self.session 60 | for attempt in range(self.max_retries): 61 | try: 62 | response = await client.request(method, url, **kwargs) 63 | response.raise_for_status() 64 | return response.json() 65 | except (httpx.HTTPStatusError, httpx.RequestError): 66 | if attempt < self.max_retries - 1: 67 | await asyncio.sleep(2**attempt) # Exponential backoff 68 | 69 | raise RuntimeError(f"Can't send request to the server: {url}") 70 | 71 | async def close(self): 72 | await self.session.aclose() 73 | 74 | 75 | class ClientState(enum.Enum): 76 | # First state of the client 77 | UNOPENED = 1 78 | # Client has either sent a request, or is within a `with` block. 79 | OPENED = 2 80 | # Client has either exited the `with` block, or `close()` called. 
81 | CLOSED = 3 82 | 83 | 84 | class MegaParseNATSClient: 85 | def __init__(self, config: ClientNATSConfig): 86 | self.nc_config = config 87 | self.max_retries = self.nc_config.max_retries 88 | self.backoff = self.nc_config.backoff 89 | if self.nc_config.ssl_config: 90 | self.ssl_ctx = load_ssl_cxt(self.nc_config.ssl_config) 91 | else: 92 | self.ssl_ctx = None 93 | # Client connection 94 | self._state = ClientState.UNOPENED 95 | self._nc = None 96 | 97 | async def _get_nc(self): 98 | if self._nc is None: 99 | self._nc = await nats.connect( 100 | self.nc_config.endpoint, 101 | tls=self.ssl_ctx, 102 | connect_timeout=self.nc_config.connect_timeout, 103 | reconnect_time_wait=self.nc_config.reconnect_time_wait, 104 | max_reconnect_attempts=self.nc_config.max_reconnect_attempts, 105 | ) 106 | return self._nc 107 | return self._nc 108 | 109 | async def __aenter__(self: Self) -> Self: 110 | if self._state != ClientState.UNOPENED: 111 | msg = { 112 | ClientState.OPENED: "Cannot open a client instance more than once.", 113 | ClientState.CLOSED: ( 114 | "Cannot reopen a client instance, client was closed." 
115 | ), 116 | }[self._state] 117 | raise RuntimeError(msg) 118 | 119 | self._state = ClientState.OPENED 120 | 121 | await self._get_nc() 122 | return self 123 | 124 | async def __aexit__( 125 | self, 126 | exc_type: type[BaseException] | None = None, 127 | exc_value: BaseException | None = None, 128 | traceback: TracebackType | None = None, 129 | ) -> None: 130 | self._state = ClientState.CLOSED 131 | await self.aclose() 132 | 133 | async def parse_url(self, url: str): 134 | url_inp = ParseUrlInput(url=url) 135 | return await self._send_req(MPInput(input=url_inp)) 136 | 137 | async def parse_file( 138 | self, file: Path | BytesIO, file_name: str | None = None 139 | ) -> str | Document: 140 | if isinstance(file, Path): 141 | with open(file, "rb") as f: 142 | data = f.read() 143 | file_name = os.path.basename(file) 144 | else: 145 | file.seek(0) 146 | data = file.read() 147 | if file_name is None: 148 | raise ValueError("please provide file_name if passing ByteIO stream") 149 | 150 | file_input = ParseFileInput( 151 | file_input=FileInput(file_name=file_name, file_size=len(data), data=data), 152 | parse_config=ParseFileConfig(), 153 | ) 154 | 155 | inp = MPInput(input=file_input) 156 | return await self._send_req(inp) 157 | 158 | async def _send_req(self, inp: MPInput) -> str | Document: 159 | logger.debug(f"Sending {inp} to megaparse service.") 160 | 161 | for attempt in range(self.max_retries): 162 | try: 163 | return await self._send_req_inner(inp) 164 | except (TimeoutError, NoRespondersError) as e: 165 | logger.error(f"Sending req error: {e}. 
Retrying for {attempt} time") 166 | if attempt < self.max_retries - 1: 167 | logger.debug(f"Backoff for {2**self.backoff}s") 168 | await asyncio.sleep(2**self.backoff) 169 | raise ParsingException 170 | 171 | async def _send_req_inner(self, inp: MPInput): 172 | nc = await self._get_nc() 173 | raw_response = await nc.request( 174 | self.nc_config.subject, 175 | inp.model_dump_json().encode("utf-8"), 176 | timeout=self.nc_config.timeout, 177 | ) 178 | response = MPOutput.model_validate_json(raw_response.data.decode("utf-8")) 179 | return self._handle_mp_output(response) 180 | 181 | def _handle_mp_output(self, response: MPOutput) -> str | Document: 182 | if response.output_type == MPOutputType.PARSE_OK: 183 | assert response.result, "Parsing OK but response is None" 184 | return response.result 185 | elif response.output_type == MPOutputType.PARSE_ERR: 186 | assert response.err, "Parsing OK but response is None" 187 | match response.err.mp_err_code: 188 | case MPErrorType.MEMORY_LIMIT: 189 | raise MemoryLimitExceeded 190 | case MPErrorType.INTERNAL_SERVER_ERROR: 191 | raise InternalServiceError 192 | case MPErrorType.MODEL_NOT_SUPPORTED: 193 | raise ModelNotSupported 194 | case MPErrorType.DOWNLOAD_ERROR: 195 | raise DownloadError 196 | case MPErrorType.PARSING_ERROR: 197 | raise ParsingException 198 | raise ValueError(f"unknown service response type: {response}") 199 | 200 | async def aclose(self): 201 | nc = await self._get_nc() 202 | await nc.close() 203 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/megaparse_sdk/config.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, FilePath 2 | from pydantic_settings import BaseSettings, SettingsConfigDict 3 | 4 | 5 | class MegaParseSDKConfig(BaseSettings): 6 | """ 7 | Configuration for the Megaparse SDK. 
8 | """ 9 | 10 | model_config = SettingsConfigDict(env_prefix="MEGAPARSE_SDK_") 11 | api_key: str | None = None 12 | url: str = "https://megaparse.tooling.quivr.app" 13 | timeout: int = 600 14 | max_retries: int = 3 15 | 16 | 17 | class SSLConfig(BaseModel): 18 | ssl_key_file: FilePath 19 | ssl_cert_file: FilePath 20 | ca_cert_file: FilePath | None = None 21 | 22 | 23 | class ClientNATSConfig(BaseSettings): 24 | model_config = SettingsConfigDict( 25 | env_prefix="MEGAPARSE_NATS_", 26 | env_file=(".env.local", ".env"), 27 | env_nested_delimiter="__", 28 | extra="ignore", 29 | ) 30 | subject: str = "parsing" 31 | endpoint: str = "https://tests@nats.tooling.quivr.app:4222" 32 | timeout: float = 20 33 | max_retries: int = 5 34 | backoff: float = 3 35 | connect_timeout: int = 5 36 | reconnect_time_wait: int = 1 37 | max_reconnect_attempts: int = 20 38 | ssl_config: SSLConfig | None = None 39 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/megaparse_sdk/endpoints/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/megaparse_sdk/endpoints/file_upload.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from httpx import Response 4 | from pydantic import BaseModel 5 | 6 | from megaparse_sdk.client import MegaParseClient 7 | from megaparse_sdk.schema.languages import Language 8 | from megaparse_sdk.schema.parser_config import ParserType, StrategyEnum 9 | 10 | 11 | class UploadFileConfig(BaseModel): 12 | method: ParserType 13 | strategy: StrategyEnum 14 | check_table: bool 15 | language: Language 16 | parsing_instruction: str | None = None 17 | model_name: str = "gpt-4o" 18 | 19 | 20 | class FileUpload: 21 | def __init__(self, client: MegaParseClient): 22 | self.client = client 23 | 24 | async def 
upload( 25 | self, 26 | file_path: str, 27 | method: ParserType = ParserType.UNSTRUCTURED, 28 | strategy: StrategyEnum = StrategyEnum.AUTO, 29 | check_table: bool = False, 30 | language: Language = Language.ENGLISH, 31 | parsing_instruction: Optional[str] = None, 32 | model_name: str = "gpt-4o", 33 | ) -> Response: 34 | data = UploadFileConfig( 35 | method=method, 36 | strategy=strategy, 37 | check_table=check_table, 38 | language=language, 39 | parsing_instruction=parsing_instruction, 40 | model_name=model_name, 41 | ) 42 | with open(file_path, "rb") as file: 43 | files = {"file": (file_path, file)} 44 | 45 | response = await self.client.request( 46 | "POST", 47 | "/v1/file", 48 | files=files, 49 | data=data.model_dump(mode="json"), 50 | ) 51 | return response 52 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/megaparse_sdk/endpoints/url_upload.py: -------------------------------------------------------------------------------- 1 | from httpx import Response 2 | 3 | from megaparse_sdk.client import MegaParseClient 4 | 5 | 6 | class URLUpload: 7 | def __init__(self, client: MegaParseClient): 8 | self.client = client 9 | 10 | async def upload(self, url: str, max_retries: int = 3) -> Response: 11 | endpoint = f"/v1/url?url={url}" 12 | headers = {"accept": "application/json"} 13 | response = await self.client.request("POST", endpoint, headers=headers, data="") 14 | return response 15 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/megaparse_sdk/schema/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/megaparse_sdk/schema/extensions.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class FileExtension(str, Enum): 5 | """Supported file 
extension enumeration.""" 6 | 7 | _mimetype: str 8 | 9 | def __new__(cls, value: str, mimetype: str): 10 | obj = str.__new__(cls, value) 11 | obj._value_ = value 12 | obj._mimetype = mimetype 13 | return obj 14 | 15 | PDF = (".pdf", "application/pdf") 16 | DOCX = ( 17 | ".docx", 18 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 19 | ) 20 | TXT = (".txt", "text/plain") 21 | OTF = (".odt", "application/vnd.oasis.opendocument.text") 22 | EPUB = (".epub", "application/epub") 23 | HTML = (".html", "text/html") 24 | XML = (".xml", "application/xml") 25 | CSV = (".csv", "text/csv") 26 | XLSX = ( 27 | ".xlsx", 28 | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 29 | ) 30 | XLS = (".xls", "application/vnd.ms-excel") 31 | PPTX = ( 32 | ".pptx", 33 | "application/vnd.openxmlformats-officedocument.presentationml.presentation", 34 | ) 35 | MD = (".md", "text/markdown") 36 | MARKDOWN = (".markdown", "text/markdown") 37 | 38 | @property 39 | def mimetype(self) -> str: 40 | return self._mimetype 41 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/megaparse_sdk/schema/languages.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class Language(str, Enum): 5 | BAZA = "abq" 6 | ADYGHE = "ady" 7 | AFRIKAANS = "af" 8 | ANGIKA = "ang" 9 | ARABIC = "ar" 10 | ASSAMESE = "as" 11 | AVAR = "ava" 12 | AZERBAIJANI = "az" 13 | BELARUSIAN = "be" 14 | BULGARIAN = "bg" 15 | BIHARI = "bh" 16 | BHOJPURI = "bho" 17 | BENGALI = "bn" 18 | BOSNIAN = "bs" 19 | SIMPLIFIED_CHINESE = "ch_sim" 20 | TRADITIONAL_CHINESE = "ch_tra" 21 | CHECHEN = "che" 22 | CZECH = "cs" 23 | WELSH = "cy" 24 | DANISH = "da" 25 | DARGWA = "dar" 26 | GERMAN = "de" 27 | ENGLISH = "en" 28 | SPANISH = "es" 29 | ESTONIAN = "et" 30 | PERSIAN_FARSI = "fa" 31 | FRENCH = "fr" 32 | IRISH = "ga" 33 | GOAN_KONKANI = "gom" 34 | HINDI = "hi" 35 | CROATIAN = "hr" 36 | 
class ModelNotSupported(Exception):
    """Raised when the requested LLM model is not in the supported list."""

    def __init__(
        self,
        message: str = "The requested model is not supported yet.",
    ):
        super().__init__(message)


class MemoryLimitExceeded(Exception):
    """Raised when the parsing service is under high memory pressure."""

    def __init__(self, message="The service is under high memory pressure"):
        super().__init__(message)


class InternalServiceError(Exception):
    """Raised for unexpected server-side failures during parsing."""

    # Fixed typo in the default message: "occured" -> "occurred".
    def __init__(self, message="Internal service error occurred"):
        super().__init__(message)


class DownloadError(Exception):
    """Raised when the remote file could not be downloaded."""

    def __init__(self, message="Failed to download the file"):
        super().__init__(message)
class FileInput(BaseModel):
    """Payload describing a file shipped inline over the message bus.

    ``data`` travels Base64-encoded on the wire (see ``serialize_data``)
    and is exposed as raw ``bytes`` on the model.
    """

    # Original file name, reported back in results/metadata.
    file_name: str
    # Size in bytes of the decoded payload.
    file_size: int
    # Raw file contents.
    data: bytes

    @field_validator("data", mode="before")
    def decode_data(cls, value):
        # Accept both raw bytes and the Base64 string used on the wire.
        if isinstance(value, str):
            try:
                return base64.b64decode(value)
            except Exception:
                raise ValueError("Invalid Base64 encoding for the 'data' field.")
        return value

    # TODO: this is slow !!! Move to reading bytes directly from bucket storage
    # append bytes with CRC32
    @field_serializer("data", return_type=str)
    def serialize_data(self, data: bytes, _info):
        # Mirror of decode_data: emit Base64 text for JSON transport.
        return base64.b64encode(data).decode("utf-8")


class MPParseType(str, Enum):
    """Discriminator values distinguishing the two parse request shapes."""

    PARSE_FILE = "parse_file"
    PARSE_URL = "parse_url"


class ParseFileInput(BaseModel):
    """Request to parse an inline file with an explicit parser configuration."""

    mp_parse_type: Literal[MPParseType.PARSE_FILE] = MPParseType.PARSE_FILE
    file_input: FileInput
    parse_config: ParseFileConfig


class ParseUrlInput(BaseModel):
    """Request to parse a document fetched from a URL by the worker."""

    mp_parse_type: Literal[MPParseType.PARSE_URL] = MPParseType.PARSE_URL
    url: str


class MPInput(BaseModel):
    """Envelope for any parse request; ``mp_parse_type`` selects the variant."""

    input: Union[ParseFileInput, ParseUrlInput] = Field(
        ..., discriminator="mp_parse_type"
    )
class MPErrorType(Enum):
    """Error categories a parse worker can report back to the client."""

    MEMORY_LIMIT = auto()
    INTERNAL_SERVER_ERROR = auto()
    MODEL_NOT_SUPPORTED = auto()
    DOWNLOAD_ERROR = auto()
    PARSING_ERROR = auto()


class ParseError(BaseModel):
    """Machine-readable error code plus a human-readable message."""

    mp_err_code: MPErrorType
    message: str


class MPOutputType(str, Enum):
    """Outcome discriminator for a parse response."""

    PARSE_OK = "parse_file_ok"
    PARSE_ERR = "parse_file_err"


class MPOutput(BaseModel):
    """Response envelope for a parse request.

    On ``PARSE_OK`` the payload is carried in ``result``; on ``PARSE_ERR``
    callers read ``err`` and ``result`` is sent as ``None``.
    """

    output_type: MPOutputType
    # Parsed text or structured Document on success; None on error.
    result: str | Document | None
    # Populated only for PARSE_ERR responses.
    err: ParseError | None = None
    metadata: Dict[str, str] = Field(default_factory=dict)
class SupportedModel(str, Enum):
    """Supported models enumeration.

    Members are ``str`` subclasses whose value is the provider model id,
    so they compare equal to the plain string
    (e.g. ``SupportedModel.GPT_4 == "gpt-4"``).
    """

    # OpenAI Models
    GPT_4 = "gpt-4"
    GPT_4_TURBO = "gpt-4-turbo"
    GPT_3_5_TURBO = "gpt-3.5-turbo"
    GPT_4O = "gpt-4o"
    GPT_4O_MINI = "gpt-4o-mini"

    # Anthropic Models
    CLAUDE_3_5_SONNET_LATEST = "claude-3-5-sonnet-latest"
    CLAUDE_3_5_SONNET = "claude-3-5-sonnet-20241022"
    CLAUDE_3_5_HAIKU = "claude-3-5-haiku-20241022"
    CLAUDE_3_5_HAIKU_LATEST = "claude-3-5-haiku-latest"
    CLAUDE_3_OPUS = "claude-3-opus-20240229"
    CLAUDE_3_OPUS_LATEST = "claude-3-opus-latest"
    CLAUDE_3_SONNET = "claude-3-sonnet-20240229"
    CLAUDE_3_HAIKU = "claude-3-haiku-20240307"

    def __str__(self):
        return self.value

    @classmethod
    def is_supported(cls, model_name: str) -> bool:
        """Check if *model_name* matches a supported model id."""
        # Compare against member values explicitly. The previous
        # `model_name in cls.__members__.values()` only worked through the
        # implicit str-mixin equality between members and plain strings.
        return any(model_name == member.value for member in cls)

    @classmethod
    def get_supported_models(cls) -> list[str]:
        """Get the list of supported model id strings."""
        # Return the plain value strings so the result matches the declared
        # `list[str]` exactly; members compare equal to these values, so
        # membership checks by callers behave as before.
        return [member.value for member in cls]
-------------------------------------------------------------------------------- 1 | [project] 2 | name = "megaparse-sdk" 3 | version = "0.1.12" 4 | description = "Megaparse SDK" 5 | dependencies = [ 6 | "python-dotenv>=1.0.0", 7 | "pycryptodome>=3.21.0", 8 | "psutil>=6.1.0", 9 | "httpx>=0.27.0", 10 | "nats-py>=2.9.0", 11 | "loguru>=0.7.2", 12 | ] 13 | 14 | readme = "README.md" 15 | requires-python = ">= 3.11" 16 | 17 | [build-system] 18 | requires = ["hatchling==1.26.3"] 19 | build-backend = "hatchling.build" 20 | 21 | [tool.rye] 22 | managed = true 23 | dev-dependencies = [] 24 | universal = true 25 | 26 | [tool.hatch.metadata] 27 | allow-direct-references = true 28 | 29 | [tool.hatch.build.targets.wheel] 30 | packages = ["megaparse_sdk"] 31 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/tests/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse_sdk/tests/README.md -------------------------------------------------------------------------------- /libs/megaparse_sdk/tests/certs/client-cert.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIEqDCCAxCgAwIBAgIRAITvq6ZEk6paYFDRbueJhEMwDQYJKoZIhvcNAQELBQAw 3 | gZ0xHjAcBgNVBAoTFW1rY2VydCBkZXZlbG9wbWVudCBDQTE5MDcGA1UECwwwYW1p 4 | bmVAYW1pbmVzLU1hY0Jvb2stUHJvLmxvY2FsIChhbWluZSBkaXJob3Vzc2kpMUAw 5 | PgYDVQQDDDdta2NlcnQgYW1pbmVAYW1pbmVzLU1hY0Jvb2stUHJvLmxvY2FsIChh 6 | bWluZSBkaXJob3Vzc2kpMB4XDTI0MTExOTEwNDgwN1oXDTI3MDIxOTEwNDgwN1ow 7 | ZDEnMCUGA1UEChMebWtjZXJ0IGRldmVsb3BtZW50IGNlcnRpZmljYXRlMTkwNwYD 8 | VQQLDDBhbWluZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFtaW5lIGRpcmhv 9 | dXNzaSkwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQC2fDlGlKYIj8bp 10 | tlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5 11 | 
KDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH 12 | qmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN 13 | gLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8 14 | ghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT 15 | WWVVcNfJAgMBAAGjgZowgZcwDgYDVR0PAQH/BAQDAgWgMCcGA1UdJQQgMB4GCCsG 16 | AQUFBwMCBggrBgEFBQcDAQYIKwYBBQUHAwQwHwYDVR0jBBgwFoAUV2w3gvQM5La1 17 | 2fk80tJXoM/14l4wOwYDVR0RBDQwMoIJbG9jYWxob3N0gRNtZWdhcGFyc2VAcXVp 18 | dnIuYXBwhxAAAAAAAAAAAAAAAAAAAAABMA0GCSqGSIb3DQEBCwUAA4IBgQAYq4VZ 19 | 6spwGvcqg8kCOghu6o54UPYo/NLzh3oYewJnDJ+2XD786TpTgjZMGA6Ms+det6oV 20 | HdT5s77VFgJiJloHlD0fpKkRxjzyBOk5/bQcCKkTMBVfgJbMoAfa2gq+/7zxmLcn 21 | AmNg7BkmsTtHWPsLyN3rYI4dkkDKWkxp8Sezm9WPEa9OGJDJSYf4Dq9pN1lUoP1p 22 | vxsq7sW0HDWnx/I2zWuz3AaT9b4UayRnk4IRYxAuYYN/k0GNjVmmDveywNoNlkmW 23 | 0Az6ycPN+vvz8Jpm3CbZSIQLO8Yn57H/aU4DmOtunm3VLUiLucmfOggv8Sq5n2g9 24 | ze61UJu9lr2/nWOXnErl3V9UL3kJ1OlbFzTWDGm9zX7boo6MLXy+fAj+Tw0sCeMr 25 | drdxo8IUYYU6HUdtuLGMFznBFFUNhfFSwFANGPB38NyofwLPSZM0hYntQqBMt/P7 26 | /E+wQ67hSEutkIbOD3kGkGREIk3dVyUeajO9DFTaQ+yTnNtnuUbxs5LkRlw= 27 | -----END CERTIFICATE----- 28 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/tests/certs/client-key.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN PRIVATE KEY----- 2 | MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQC2fDlGlKYIj8bp 3 | tlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5 4 | KDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH 5 | qmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN 6 | gLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8 7 | ghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT 8 | WWVVcNfJAgMBAAECggEBAIK2AlSzHyacze8UH16qDTzibGVRGjxkf895Rnqi6COU 9 | QYD3PQrsVYCS/sMbHiujHV7FZC+rRcmufaBTVl7bH10yGIQc28iZ2YtbsppTEkTj 10 | 
rGUynTtXJPNHZ2vJOs1I9LXdk7maogPN2zzraIQP7AgTGCSOclIi3fpfRmfKwUOj 11 | BkEzj7CbaAGtW9vTamPJG/+wgaaBcPhplQk4cD2mjdaMLfGQXNXiYgp09kf0hJ2k 12 | 0QbsQBC85bMSfmPAsoTRLxi94S12at3SABgF0oOCy9FZs/sWsdJRI6nbfvZ3C4xo 13 | 8y+rH7Yaej7AYK+jbU3Uk/1473cuCAnNKg65UyU4+gECgYEA2/ZQYRDU3JWNHQGy 14 | dJXZRl6hSFCw9y9RUc/QjcRs+VlnXE5UK1eLwfcKh0YYRhIWSE8z3mZmK09M/FG0 15 | xbU4qIZbDYcAI2nCiUeT8HmTjVSPMS1oWZrt7rh00gcyoLQt2TUS3bo2tsmdPyWW 16 | OgEiYfb4MoG/KCdYlACE6O4GMMECgYEA1GIMIHM2x4B1wgLnKeI3X2wYWuYCHtFB 17 | Px56GUFTZytBsHghxtovVlLh88FNS5rthvXuE0FHE9RljKhZaNgqrPOrlAZSuv18 18 | vK7RmG/NPJl2osbs677a/xoxNuVkfrRcxl4cvYOBL5huHo1D5sOitGFW+IlscgWY 19 | nWzXlY7AYQkCgYA6H96hp7b4CzTc42Pq1uYxaDQqTdhVmVVdzxKHQ86gHXXouHIZ 20 | eereeI95q5YifgkRVoyYSmrZKv1m95hTXk34inhpHLF2qi3T5Ow88YOCJ0QndJ5M 21 | f1o8aNXF4k0IllQ/P30axmhK6P/6fc4yybXyOTbg8dQ3oh4QDgsRGkTcgQKBgQCG 22 | qLgJpyN3cPK5FYAeJUl4nh//GlED2yekbp15/9py0pFu42x/GX3kHN8Y31oz8sJh 23 | zPKrkLsRTp0ohuFRwaWlTUZfr3arCugY9jr8jP6zSpZW9QvpGXTfRGsp5F5Im/Eq 24 | 8ScF3ih91gcUJfuEiExUVFeBdBinXvb58bXrJLzDiQKBgG+Z06uj2dWxtK4nqJvP 25 | HllTocAGVm+fEmupVsLU6ksVVrOl8O9TapMbY8pUj9J5oBYJvY+KFGoIoxYwhZrz 26 | 4NqY7iv8w+LQ7mQIwcQ4B67pDAQMJZTShR5v57FlAZldP5UpE5ASt22isBW31sYI 27 | 1OaXIqrCA/V43NydDezh0ylQ 28 | -----END PRIVATE KEY----- 29 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/tests/certs/rootCA.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIFCzCCA3OgAwIBAgIQESt0eck2KvFrAMyiDyceujANBgkqhkiG9w0BAQsFADCB 3 | nTEeMBwGA1UEChMVbWtjZXJ0IGRldmVsb3BtZW50IENBMTkwNwYDVQQLDDBhbWlu 4 | ZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFtaW5lIGRpcmhvdXNzaSkxQDA+ 5 | BgNVBAMMN21rY2VydCBhbWluZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFt 6 | aW5lIGRpcmhvdXNzaSkwHhcNMjQxMTE5MTAwMTA5WhcNMzQxMTE5MTAwMTA5WjCB 7 | nTEeMBwGA1UEChMVbWtjZXJ0IGRldmVsb3BtZW50IENBMTkwNwYDVQQLDDBhbWlu 8 | ZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFtaW5lIGRpcmhvdXNzaSkxQDA+ 9 | 
BgNVBAMMN21rY2VydCBhbWluZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFt 10 | aW5lIGRpcmhvdXNzaSkwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQCw 11 | 6TX1kvqVMb8ZUQVT/vuDsedmbYgSFn68yJRlmE9BsqG7TLQHl2Kw6VQqZBSIkeZG 12 | CypmUysX/3qrvICeArIdmmsrWUTDYPoauw/a/RY0I07rALj3YR0Y7039Hxf/UPT9 13 | xlUtnM2NafkZyp6WRjEN0N4ETvJDIbUQiosiiPilxhwRbJURhT/JPskaw+OM2Sw5 14 | dFAT20zkYC5VIc4wJBFLAMG0XzI6Sy/4wI1WdRBXd2UMpQU4u7TyD0RB4mnHorV6 15 | kXjtLKD/KWSrSG1nnum9SB9eVatbRD+TUgoclwAKedrlCDEM4EsXVVuUuYCizQNb 16 | +H3BSPfj1upUW5eKfgAyB+8r4QGf2yCY9O8NMMrJ1K5Qv4vSuWAU2tZqAyE8Z4Ke 17 | UtHsl/M0zIvIKwyki2N/rieL/m6lTzS3dwSf9vv7eePEvxd8SBClSF07MUzyxkZ5 18 | UYNxaK5t2ZRADZ6n/9/hAQsMscCkHiX1N2ypBFV+86Pr78BC48JgIyCMwuiBN4sC 19 | AwEAAaNFMEMwDgYDVR0PAQH/BAQDAgIEMBIGA1UdEwEB/wQIMAYBAf8CAQAwHQYD 20 | VR0OBBYEFFdsN4L0DOS2tdn5PNLSV6DP9eJeMA0GCSqGSIb3DQEBCwUAA4IBgQBj 21 | KosfLfW/ZH80NM16pvpyRF3mCi+q+I+P8zrfilMYJBH4EEdEGAUgTO5do1kJXeel 22 | Wky+FNxaP6KCNiT+0amypKg+yjBlnqLKVdnEgR5s12ZfmerV59stx1A/c/bYMEAS 23 | re6xskBkowP2cVQHAC2dy/0Ov+lZsiNaPV2bQx6KUJurveebUQsH3uF3ZEhnUVQ6 24 | rt5+JGY4x9Tr1YMhvHqEDTrsipPdDB1MyW1SnCkqSXrz+DPXGd8BW0O0hpM5la81 25 | J+rfZGinbcUgXM6JMLIHDxLc4Xxzm4NijFzXhbR3XPXqEwsnZOuxcYYFgUGs3FwS 26 | 4ro+34a/O4uKS2KV8wsUWj/tWD2rLpduDgag4WSipCvWtaNve8gPdUiyPxUqxyoZ 27 | aFAFg/izXwmRntogJtV0Zvo3fqAaQQDl8t2s21IIx0wmgHzgmkswb5OwFg3dOn/S 28 | lmaH8v7FCBP7jHx/NCPTT5Sy/1EMRATmhFDUZ8Bod/TIlV3e+FCVqlX3kBBRbAU= 29 | -----END CERTIFICATE----- 30 | -------------------------------------------------------------------------------- /libs/megaparse_sdk/tests/pdf/MegaFake_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuivrHQ/MegaParse/ba9a24aec950d6cf14834b8e2e11f5725778f12e/libs/megaparse_sdk/tests/pdf/MegaFake_report.pdf -------------------------------------------------------------------------------- /libs/megaparse_sdk/tests/pdf/sample_table.pdf: -------------------------------------------------------------------------------- 
# Connection parameters for the local NATS broker the tests talk to.
NATS_URL = "nats://test@127.0.0.1:4222"
NATS_SUBJECT = "parsing"
SSL_CERT_FILE = "./tests/certs/client-cert.pem"
SSL_KEY_FILE = "./tests/certs/client-key.pem"
CA_CERT_FILE = "./tests/certs/rootCA.pem"


@pytest.fixture(scope="session")
def ssl_config() -> SSLConfig:
    """mTLS material (client cert/key + root CA) shared by the test session."""
    return SSLConfig(
        ca_cert_file=CA_CERT_FILE,
        ssl_key_file=SSL_KEY_FILE,
        ssl_cert_file=SSL_CERT_FILE,
    )


@pytest.fixture(scope="session")
def nc_config(ssl_config: SSLConfig) -> ClientNATSConfig:
    """Client config tuned for fast failure: tiny timeout, single retry.

    NOTE(review): backoff=-1 presumably makes the retry delay non-positive
    so tests never sleep — confirm against ClientNATSConfig's retry logic.
    """
    config = ClientNATSConfig(
        subject=NATS_SUBJECT,
        endpoint=NATS_URL,
        ssl_config=ssl_config,
        timeout=0.5,
        max_retries=1,
        backoff=-1,
        connect_timeout=1,
        reconnect_time_wait=1,
        max_reconnect_attempts=1,
    )
    return config
@pytest_asyncio.fixture(scope="function")
async def nats_service(nc_config: ClientNATSConfig):
    """Yield a raw NATS connection used by tests to play the parsing worker.

    The connection is drained (flushed and unsubscribed) at teardown.
    """
    # TODO: fix TLS handshake to work in CI
    # ssl_config = load_ssl_cxt(nc_config.ssl_config)
    # Fixed: the previous code passed `tls=ssl_config`, but `ssl_config` was
    # not requested as a fixture parameter here, so the name resolved to the
    # module-level fixture *function* — never a valid ssl.SSLContext. The
    # plain nats:// URL meant the argument was ignored, masking the bug.
    # Drop the argument entirely until TLS is re-enabled via load_ssl_cxt.
    nc = await nats.connect(
        nc_config.endpoint,
        connect_timeout=nc_config.connect_timeout,
        reconnect_time_wait=nc_config.reconnect_time_wait,
        max_reconnect_attempts=nc_config.max_reconnect_attempts,
    )
    yield nc
    await nc.drain()
@pytest.mark.asyncio(loop_scope="session")
async def test_client_parse_timeout(nats_service: Client, ssl_config: SSLConfig):
    """A worker that never replies within `timeout` surfaces ParsingException."""
    nc_config = ClientNATSConfig(
        subject=NATS_SUBJECT,
        endpoint=NATS_URL,
        ssl_config=ssl_config,
        timeout=0.1,
        max_retries=1,
        backoff=1,
    )

    async def service(msg):
        # Sleep past the client timeout so the request always expires
        # without a reply being published.
        await asyncio.sleep(2 * nc_config.timeout)

    await nats_service.subscribe(NATS_SUBJECT, "worker", cb=service)

    file_path = Path("./tests/pdf/sample_table.pdf")
    with pytest.raises(ParsingException):
        async with MegaParseNATSClient(nc_config) as mp_client:
            await mp_client.parse_file(file=file_path)


@pytest.mark.asyncio(loop_scope="session")
async def test_client_parse_timeout_retry(nats_service: Client, ssl_config: SSLConfig):
    """Every configured attempt reaches the worker before the client gives up.

    NOTE(review): backoff=-5 presumably makes the inter-retry delay
    non-positive so both attempts happen immediately — confirm against the
    client's retry implementation.
    """
    nc_config = ClientNATSConfig(
        subject=NATS_SUBJECT,
        endpoint=NATS_URL,
        ssl_config=ssl_config,
        timeout=0.1,
        max_retries=2,
        backoff=-5,
    )

    # Collects every request the worker receives, one entry per attempt.
    msgs = []

    async def service(msg):
        msgs.append(msg)
        # Never answer in time, forcing the client to retry.
        await asyncio.sleep(2 * nc_config.timeout)

    await nats_service.subscribe(NATS_SUBJECT, "worker", cb=service)

    file_path = Path("./tests/pdf/sample_table.pdf")
    with pytest.raises(ParsingException):
        async with MegaParseNATSClient(nc_config) as mp_client:
            await mp_client.parse_file(file=file_path)
    # max_retries=2 means exactly two requests must have hit the worker.
    assert len(msgs) == 2
@pytest.mark.asyncio(loop_scope="session")
@pytest.mark.parametrize(
    "mp_error_type, exception_class",
    [
        ("MEMORY_LIMIT", MemoryLimitExceeded),
        ("INTERNAL_SERVER_ERROR", InternalServiceError),
        ("MODEL_NOT_SUPPORTED", ModelNotSupported),
        ("DOWNLOAD_ERROR", DownloadError),
        ("PARSING_ERROR", ParsingException),
    ],
)
async def test_client_parse_file_excp(
    nats_service: Client, nc_config: ClientNATSConfig, mp_error_type, exception_class
):
    """Each MPErrorType the worker reports maps to its SDK exception class."""

    async def message_handler(msg):
        # Fake worker: validate the request shape, then answer with a
        # PARSE_ERR carrying the parametrized error code.
        parsed_input = MPInput.model_validate_json(msg.data.decode("utf-8")).input
        assert isinstance(parsed_input, ParseFileInput)
        err = ParseError(mp_err_code=MPErrorType[mp_error_type], message="")
        output = MPOutput(
            output_type=MPOutputType.PARSE_ERR,
            err=err,
            result=None,
        )
        await nats_service.publish(msg.reply, output.model_dump_json().encode("utf-8"))

    await nats_service.subscribe(NATS_SUBJECT, "worker", cb=message_handler)

    file_path = Path("./tests/pdf/sample_table.pdf")
    # The client must translate the wire-level error code into the
    # corresponding exception type for callers.
    with pytest.raises(exception_class):
        async with MegaParseNATSClient(nc_config) as mp_client:
            await mp_client.parse_file(file=file_path)
14 | "packaging>=22.0", 15 | ] 16 | 17 | [build-system] 18 | requires = ["hatchling==1.26.3"] 19 | build-backend = "hatchling.build" 20 | 21 | [tool.rye] 22 | python = ">= 3.11" 23 | managed = true 24 | universal = true 25 | dev-dependencies = [ 26 | "mypy>=1.11.1", 27 | "pre-commit>=3.8.0", 28 | "ipykernel>=6.29.5", 29 | "ruff>=0.6.0", 30 | "flake8>=7.1.1", 31 | "flake8-black>=0.3.6", 32 | "pytest-asyncio>=0.23.8", 33 | "pytest>=8.3.3", 34 | "pytest-xdist>=3.6.1", 35 | "pytest-cov>=5.0.0", 36 | "pytest-profiling>=1.8.1", 37 | ] 38 | 39 | [tool.rye.workspace] 40 | members = ["libs/*"] 41 | 42 | [tool.hatch.metadata] 43 | allow-direct-references = true 44 | 45 | [tool.hatch.build.targets.wheel] 46 | packages = ["src/megaparse"] 47 | 48 | [tool.ruff] 49 | line-length = 88 50 | exclude = [".git", "__pycache__", ".mypy_cache", ".pytest_cache"] 51 | 52 | [tool.ruff.lint] 53 | select = [ 54 | "E", # pycodestyle errors 55 | "W", # pycodestyle warnings 56 | "F", # pyflakes 57 | "I", # isort 58 | "C", # flake8-comprehensions 59 | "B", # flake8-bugbear 60 | ] 61 | ignore = [ 62 | "B904", 63 | "B006", 64 | "E501", # line too long, handled by black 65 | "B008", # do not perform function calls in argument defaults 66 | "C901", # too complex 67 | ] 68 | 69 | [tool.ruff.lint.isort] 70 | order-by-type = true 71 | relative-imports-order = "closest-to-furthest" 72 | extra-standard-library = ["typing"] 73 | section-order = [ 74 | "future", 75 | "standard-library", 76 | "third-party", 77 | "first-party", 78 | "local-folder", 79 | ] 80 | known-first-party = [] 81 | 82 | 83 | [tool.pytest.ini_options] 84 | addopts = "--tb=short -ra -v" 85 | asyncio_default_fixture_loop_scope = "session" 86 | filterwarnings = ["ignore::DeprecationWarning"] 87 | markers = [ 88 | "slow: marks tests as slow (deselect with '-m \"not slow\"')", 89 | "base: these tests require quivr-core with extra `base` to be installed", 90 | "tika: these tests require a tika server to be running", 91 | "unstructured: these 
tests require `unstructured` dependency", 92 | ] 93 | -------------------------------------------------------------------------------- /release-please-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://raw.githubusercontent.com/googleapis/release-please/main/schemas/config.json", 3 | "separate-pull-requests": true, 4 | "include-v-in-tag": true, 5 | "bump-patch-for-minor-pre-major": true, 6 | "include-component-in-tag": true, 7 | "packages": { 8 | "libs/megaparse": { 9 | "release-type": "python", 10 | "package-name": "megaparse", 11 | "changelog-notes-type": "github" 12 | }, 13 | "libs/megaparse_sdk": { 14 | "release-type": "python", 15 | "package-name": "megaparse-sdk", 16 | "changelog-notes-type": "github" 17 | } 18 | } 19 | } 20 | --------------------------------------------------------------------------------