├── .github
│   └── workflows
│       ├── benchmarks.yml
│       ├── ci.yml
│       ├── cla.yml
│       ├── publish.yml
│       └── scripts.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CLA.md
├── LICENSE
├── README.md
├── benchmarks
│   ├── __init__.py
│   ├── overall
│   │   ├── __init__.py
│   │   ├── display
│   │   │   ├── __init__.py
│   │   │   ├── dataset.py
│   │   │   └── table.py
│   │   ├── download
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── llamaparse.py
│   │   │   ├── main.py
│   │   │   ├── mathpix.py
│   │   │   └── mistral.py
│   │   ├── elo.py
│   │   ├── methods
│   │   │   ├── __init__.py
│   │   │   ├── docling.py
│   │   │   ├── gt.py
│   │   │   ├── llamaparse.py
│   │   │   ├── marker.py
│   │   │   ├── mathpix.py
│   │   │   ├── mistral.py
│   │   │   ├── olmocr.py
│   │   │   └── schema.py
│   │   ├── overall.py
│   │   ├── registry.py
│   │   ├── schema.py
│   │   └── scorers
│   │       ├── __init__.py
│   │       ├── clean.py
│   │       ├── heuristic.py
│   │       ├── llm.py
│   │       └── schema.py
│   ├── table
│   │   ├── __init__.py
│   │   ├── gemini.py
│   │   ├── inference.py
│   │   ├── scoring.py
│   │   └── table.py
│   ├── throughput
│   │   ├── __init__.py
│   │   └── main.py
│   └── verify_scores.py
├── chunk_convert.py
├── convert.py
├── convert_single.py
├── data
│   ├── .gitignore
│   ├── examples
│   │   ├── json
│   │   │   ├── multicolcnn.json
│   │   │   ├── switch_trans.json
│   │   │   └── thinkpython.json
│   │   └── markdown
│   │       ├── multicolcnn
│   │       │   ├── _page_1_Figure_0.jpeg
│   │       │   ├── _page_2_Picture_0.jpeg
│   │       │   ├── _page_6_Figure_0.jpeg
│   │       │   ├── _page_7_Figure_0.jpeg
│   │       │   ├── multicolcnn.md
│   │       │   └── multicolcnn_meta.json
│   │       ├── switch_transformers
│   │       │   ├── _page_11_Figure_4.jpeg
│   │       │   ├── _page_12_Figure_4.jpeg
│   │       │   ├── _page_13_Figure_2.jpeg
│   │       │   ├── _page_18_Figure_1.jpeg
│   │       │   ├── _page_18_Figure_3.jpeg
│   │       │   ├── _page_20_Figure_1.jpeg
│   │       │   ├── _page_20_Figure_4.jpeg
│   │       │   ├── _page_27_Figure_1.jpeg
│   │       │   ├── _page_29_Figure_1.jpeg
│   │       │   ├── _page_2_Figure_3.jpeg
│   │       │   ├── _page_30_Figure_1.jpeg
│   │       │   ├── _page_31_Figure_3.jpeg
│   │       │   ├── _page_4_Figure_1.jpeg
│   │       │   ├── _page_5_Figure_3.jpeg
│   │       │   ├── switch_trans.md
│   │       │   └── switch_trans_meta.json
│   │       └── thinkpython
│   │           ├── _page_109_Figure_1.jpeg
│   │           ├── _page_115_Figure_1.jpeg
│   │           ├── _page_116_Figure_3.jpeg
│   │           ├── _page_127_Figure_1.jpeg
│   │           ├── _page_128_Figure_1.jpeg
│   │           ├── _page_167_Figure_1.jpeg
│   │           ├── _page_169_Figure_1.jpeg
│   │           ├── _page_173_Figure_1.jpeg
│   │           ├── _page_190_Figure_1.jpeg
│   │           ├── _page_195_Figure_1.jpeg
│   │           ├── _page_205_Figure_1.jpeg
│   │           ├── _page_230_Figure_1.jpeg
│   │           ├── _page_233_Figure_1.jpeg
│   │           ├── _page_233_Figure_3.jpeg
│   │           ├── _page_234_Figure_1.jpeg
│   │           ├── _page_235_Figure_1.jpeg
│   │           ├── _page_236_Figure_1.jpeg
│   │           ├── _page_236_Figure_3.jpeg
│   │           ├── _page_237_Figure_1.jpeg
│   │           ├── _page_238_Figure_1.jpeg
│   │           ├── _page_23_Figure_1.jpeg
│   │           ├── _page_23_Figure_3.jpeg
│   │           ├── _page_46_Figure_1.jpeg
│   │           ├── _page_60_Figure_1.jpeg
│   │           ├── _page_60_Figure_3.jpeg
│   │           ├── _page_67_Figure_1.jpeg
│   │           ├── _page_71_Figure_1.jpeg
│   │           ├── _page_78_Figure_1.jpeg
│   │           ├── _page_85_Figure_1.jpeg
│   │           ├── _page_94_Figure_1.jpeg
│   │           ├── _page_99_Figure_17.jpeg
│   │           ├── _page_99_Figure_178.jpeg
│   │           ├── thinkpython.md
│   │           └── thinkpython_meta.json
│   ├── images
│   │   ├── overall.png
│   │   ├── per_doc.png
│   │   └── table.png
│   └── latex_to_md.sh
├── extraction_app.py
├── marker
│   ├── builders
│   │   ├── __init__.py
│   │   ├── document.py
│   │   ├── layout.py
│   │   ├── line.py
│   │   ├── llm_layout.py
│   │   ├── ocr.py
│   │   └── structure.py
│   ├── config
│   │   ├── __init__.py
│   │   ├── crawler.py
│   │   ├── parser.py
│   │   └── printer.py
│   ├── converters
│   │   ├── __init__.py
│   │   ├── extraction.py
│   │   ├── ocr.py
│   │   ├── pdf.py
│   │   └── table.py
│   ├── extractors
│   │   ├── __init__.py
│   │   └── page.py
│   ├── logger.py
│   ├── models.py
│   ├── output.py
│   ├── processors
│   │   ├── __init__.py
│   │   ├── blockquote.py
│   │   ├── code.py
│   │   ├── debug.py
│   │   ├── document_toc.py
│   │   ├── equation.py
│   │   ├── footnote.py
│   │   ├── ignoretext.py
│   │   ├── line_merge.py
│   │   ├── line_numbers.py
│   │   ├── list.py
│   │   ├── llm
│   │   │   ├── __init__.py
│   │   │   ├── llm_complex.py
│   │   │   ├── llm_equation.py
│   │   │   ├── llm_form.py
│   │   │   ├── llm_handwriting.py
│   │   │   ├── llm_image_description.py
│   │   │   ├── llm_mathblock.py
│   │   │   ├── llm_meta.py
│   │   │   ├── llm_table.py
│   │   │   └── llm_table_merge.py
│   │   ├── order.py
│   │   ├── page_header.py
│   │   ├── reference.py
│   │   ├── sectionheader.py
│   │   ├── table.py
│   │   ├── text.py
│   │   └── util.py
│   ├── providers
│   │   ├── __init__.py
│   │   ├── document.py
│   │   ├── epub.py
│   │   ├── html.py
│   │   ├── image.py
│   │   ├── pdf.py
│   │   ├── powerpoint.py
│   │   ├── registry.py
│   │   ├── spreadsheet.py
│   │   └── utils.py
│   ├── renderers
│   │   ├── __init__.py
│   │   ├── extraction.py
│   │   ├── html.py
│   │   ├── json.py
│   │   ├── markdown.py
│   │   └── ocr_json.py
│   ├── schema
│   │   ├── __init__.py
│   │   ├── blocks
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── basetable.py
│   │   │   ├── caption.py
│   │   │   ├── code.py
│   │   │   ├── complexregion.py
│   │   │   ├── equation.py
│   │   │   ├── figure.py
│   │   │   ├── footnote.py
│   │   │   ├── form.py
│   │   │   ├── handwriting.py
│   │   │   ├── inlinemath.py
│   │   │   ├── listitem.py
│   │   │   ├── pagefooter.py
│   │   │   ├── pageheader.py
│   │   │   ├── picture.py
│   │   │   ├── reference.py
│   │   │   ├── sectionheader.py
│   │   │   ├── table.py
│   │   │   ├── tablecell.py
│   │   │   ├── text.py
│   │   │   └── toc.py
│   │   ├── document.py
│   │   ├── groups
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── figure.py
│   │   │   ├── list.py
│   │   │   ├── page.py
│   │   │   ├── picture.py
│   │   │   └── table.py
│   │   ├── polygon.py
│   │   ├── registry.py
│   │   └── text
│   │       ├── __init__.py
│   │       ├── char.py
│   │       ├── line.py
│   │       └── span.py
│   ├── scripts
│   │   ├── __init__.py
│   │   ├── chunk_convert.py
│   │   ├── chunk_convert.sh
│   │   ├── common.py
│   │   ├── convert.py
│   │   ├── convert_single.py
│   │   ├── extraction_app.py
│   │   ├── file_to_s3.py
│   │   ├── run_streamlit_app.py
│   │   ├── server.py
│   │   └── streamlit_app.py
│   ├── services
│   │   ├── __init__.py
│   │   ├── claude.py
│   │   ├── gemini.py
│   │   ├── ollama.py
│   │   ├── openai.py
│   │   └── vertex.py
│   ├── settings.py
│   └── util.py
├── marker_app.py
├── marker_server.py
├── poetry.lock
├── pyproject.toml
├── pytest.ini
├── signatures
│   └── version1
│       └── cla.json
├── static
│   └── fonts
│       └── .gitignore
└── tests
    ├── builders
    │   ├── test_blank_page.py
    │   ├── test_document_builder.py
    │   ├── test_garbled_pdf.py
    │   ├── test_layout_replace.py
    │   ├── test_line_builder.py
    │   ├── test_merged_lines.py
    │   ├── test_ocr_builder.py
    │   ├── test_ocr_pipeline.py
    │   ├── test_overriding.py
    │   ├── test_pdf_links.py
    │   ├── test_rotated_bboxes.py
    │   ├── test_strip_existing_ocr.py
    │   └── test_structure.py
    ├── config
    │   └── test_config.py
    ├── conftest.py
    ├── converters
    │   ├── test_extraction_converter.py
    │   ├── test_ocr_converter.py
    │   ├── test_pdf_converter.py
    │   └── test_table_converter.py
    ├── processors
    │   ├── test_document_toc_processor.py
    │   ├── test_equation_processor.py
    │   ├── test_footnote_processor.py
    │   ├── test_ignoretext.py
    │   ├── test_llm_processors.py
    │   ├── test_table_merge.py
    │   └── test_table_processor.py
    ├── providers
    │   ├── test_document_providers.py
    │   ├── test_image_provider.py
    │   └── test_pdf_provider.py
    ├── renderers
    │   ├── test_extract_images.py
    │   ├── test_json_renderer.py
    │   └── test_markdown_renderer.py
    ├── schema
    │   └── groups
    │       └── test_list_grouping.py
    ├── services
    │   └── test_service_init.py
    └── utils.py
/.github/workflows/benchmarks.yml:
--------------------------------------------------------------------------------
1 | name: Integration test
2 |
3 | on: [push]
4 |
5 | env:
6 | PYTHONIOENCODING: "utf-8"
7 |
8 | jobs:
9 | benchmark:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: actions/checkout@v3
13 | - name: Set up Python 3.11
14 | uses: actions/setup-python@v4
15 | with:
16 | python-version: 3.11
17 | - name: Install apt dependencies
18 | run: |
19 | sudo apt-get update
20 | sudo apt-get install -y pandoc
21 | - name: Install python dependencies
22 | run: |
23 | pip install poetry
24 | poetry install --extras "full"
25 | - name: Run benchmark test
26 | run: |
27 | poetry run python benchmarks/overall/overall.py --max_rows 5
28 | poetry run python benchmarks/verify_scores.py conversion_results/benchmark/overall/result.json --type marker
29 | - name: Run table benchmark
30 | run: |
31 | poetry run python benchmarks/table/table.py --max_rows 5
32 | poetry run python benchmarks/verify_scores.py conversion_results/benchmark/table/table.json --type table
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI tests
2 |
3 | on: [push]
4 |
5 | jobs:
6 | tests:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - uses: actions/checkout@v3
10 | - name: Set up Python 3.11
11 | uses: actions/setup-python@v4
12 | with:
13 | python-version: 3.11
14 | - name: Install python dependencies
15 | run: |
16 | pip install poetry
17 | poetry install --extras "full"
18 | - name: Run tests
19 | env:
20 | HF_TOKEN: ${{ secrets.HF_TOKEN }}
21 | run: poetry run pytest
22 |
--------------------------------------------------------------------------------
/.github/workflows/cla.yml:
--------------------------------------------------------------------------------
1 | name: "Marker CLA Assistant"
2 | on:
3 | issue_comment:
4 | types: [created]
5 | pull_request_target:
6 | types: [opened,closed,synchronize]
7 |
8 | # explicitly configure permissions, in case your GITHUB_TOKEN workflow permissions are set to read-only in repository settings
9 | permissions:
10 | actions: write
11 | contents: write
12 | pull-requests: write
13 | statuses: write
14 |
15 | jobs:
16 | CLAAssistant:
17 | runs-on: ubuntu-latest
18 | steps:
19 | - name: "Marker CLA Assistant"
20 | if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') || github.event_name == 'pull_request_target'
21 | uses: contributor-assistant/github-action@v2.3.0
22 | env:
23 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
24 | # the below token should have repo scope and must be manually added by you in the repository's secrets
25 | # This token is required only if you have configured to store the signatures in a remote repository/organization
26 | PERSONAL_ACCESS_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
27 | with:
28 | path-to-signatures: 'signatures/version1/cla.json'
29 | path-to-document: 'https://github.com/VikParuchuri/marker/blob/master/CLA.md'
30 | # branch should not be protected
31 | branch: 'master'
32 | allowlist: VikParuchuri
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Python package
2 | on:
3 | push:
4 | tags:
5 | - "v*.*.*"
6 | jobs:
7 | build:
8 | runs-on: ubuntu-latest
9 | steps:
10 | - uses: actions/checkout@v3
11 | - name: Set up Python 3.11
12 | uses: actions/setup-python@v4
13 | with:
14 | python-version: 3.11
15 | - name: Install python dependencies
16 | run: |
17 | pip install poetry
18 | poetry install --extras "full"
19 | - name: Build package
20 | run: |
21 | poetry build
22 | - name: Publish package
23 | env:
24 | PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
25 | run: |
26 | poetry config pypi-token.pypi "$PYPI_TOKEN"
27 | poetry publish
28 |
--------------------------------------------------------------------------------
/.github/workflows/scripts.yml:
--------------------------------------------------------------------------------
1 | name: Test CLI scripts
2 |
3 | on: [push]
4 |
5 | jobs:
6 | tests:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - uses: actions/checkout@v3
10 | - name: Set up Python 3.11
11 | uses: actions/setup-python@v4
12 | with:
13 | python-version: 3.11
14 | - name: Install python dependencies
15 | run: |
16 | pip install poetry
17 | poetry install --extras "full"
18 | - name: Download benchmark data
19 | run: |
20 | wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi"
21 | unzip -o benchmark_data.zip
22 | - name: Test single script
23 | run: poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0
24 | - name: Test convert script
25 | run: poetry run marker benchmark_data/pdfs --max_files 1 --workers 1 --page_range 0
26 | - name: Test convert script with multiple workers
27 | run: poetry run marker benchmark_data/pdfs --max_files 2 --workers 2 --page_range 0-5
28 | - name: Test llm option
29 | run: |
30 | poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0 --use_llm > output.txt || echo "Command failed but continuing"
31 | if ! grep -q "UserWarning" output.txt; then
32 | echo "Success: No UserWarning found"
33 | exit 0
34 | else
35 | echo "Error: UserWarning found in output"
36 | exit 1
37 | fi
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/astral-sh/ruff-pre-commit
3 | # Ruff version.
4 | rev: v0.9.10
5 | hooks:
6 | # Run the linter.
7 | - id: ruff
8 | types_or: [ python, pyi ]
9 | args: [ --fix ]
10 | # Run the formatter.
11 | - id: ruff-format
12 | types_or: [ python, pyi ]
--------------------------------------------------------------------------------
/benchmarks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/benchmarks/__init__.py
--------------------------------------------------------------------------------
/benchmarks/overall/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/benchmarks/overall/__init__.py
--------------------------------------------------------------------------------
/benchmarks/overall/display/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/benchmarks/overall/display/__init__.py
--------------------------------------------------------------------------------
/benchmarks/overall/display/dataset.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import List
 3 | import PIL.Image  # used by the render() fallback below
4 | import datasets
5 | from tqdm import tqdm
6 |
7 | from benchmarks.overall.registry import METHOD_REGISTRY
8 | from benchmarks.overall.schema import FullResult
9 |
10 |
11 | def build_dataset(bench_dataset: datasets.Dataset, result: FullResult, score_types: List[str], max_rows: int | None = None) -> datasets.Dataset:
12 | rows = []
13 | for idx, sample in tqdm(enumerate(bench_dataset), desc="Building dataset"):
14 | if idx not in result["markdown"]:
15 | continue
16 |
17 | if max_rows is not None and idx >= max_rows:
18 | break
19 |
20 | row = {
21 | "uuid": sample["uuid"],
22 | "classification": sample["classification"],
23 | "language": sample["language"],
24 | "img": sample["img"],
25 | }
26 | for method in result["markdown"][idx]:
27 | if method == "gt":
28 | continue
29 |
30 | method_cls = METHOD_REGISTRY[method]()
31 | md = result["markdown"][idx][method]
32 | try:
33 | method_img = method_cls.render(result["markdown"][idx][method])
34 | except Exception as e:
35 | # This can happen when the markdown is None
36 | method_img = PIL.Image.new("RGB", (200, 200))
37 |
38 | row[f"{method}_md"] = md
39 | row[f"{method}_img"] = method_img
40 |
41 | for score_type in score_types:
42 | try:
43 | row[f"{method}_{score_type}"] = result["scores"][idx][method][score_type]["score"]
44 | except KeyError:
45 | row[f"{method}_{score_type}"] = -1.0 # Missing score
46 | try:
47 | row[f"{method}_{score_type}_detail"] = json.dumps(result["scores"][idx][method][score_type]["specific_scores"])
48 | except KeyError:
49 | row[f"{method}_{score_type}_detail"] = "" # Missing detail
50 | rows.append(row)
51 | ds = datasets.Dataset.from_list(rows)
52 | return ds
53 |
54 |
--------------------------------------------------------------------------------
/benchmarks/overall/display/table.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import Dict, List
3 |
4 | import tabulate
5 |
6 | from benchmarks.overall.schema import FullResult
7 |
8 | def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str):
9 | table = tabulate.tabulate(rows, headers=headers, tablefmt="github")
10 | with open(out_path / filename, "w", encoding="utf-8") as f:
11 | f.write(f"# {title}\n")
12 | f.write(table)
13 | print(title)
14 | print(table)
15 |
16 |
17 | def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"):
18 | document_types = list(result["averages_by_type"][default_method][default_score_type].keys())
19 | headers = ["Document Type"]
20 | for method in methods:
21 | for score_type in score_types:
22 | headers.append(f"{method} {score_type}")
23 |
24 | document_rows = [[k] for k in document_types]
25 | for i, doc_type in enumerate(document_types):
26 | for method in methods:
27 | for score_type in score_types:
28 | avg_score = sum(result["averages_by_type"][method][score_type][doc_type]) / max(1, len(result["averages_by_type"][method][score_type][doc_type]))
29 | document_rows[i].append(avg_score)
30 |
31 | write_table("Document Types", document_rows, headers, out_path, "document_types.md")
32 |
33 | headers = ["Block Type"]
34 | block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys()) # all possible blocks
35 | block_score_types = list(result["averages_by_block_type"][default_method].keys())
36 | for method in methods:
37 | for score_type in block_score_types:
38 | headers.append(f"{method} {score_type}")
39 |
40 | block_rows = [[k] for k in block_types]
41 | for i, block_type in enumerate(block_types):
42 | for method in methods:
43 | for score_type in block_score_types:
44 | avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type]))
45 | block_rows[i].append(avg_score)
46 |
47 | write_table("Block types", block_rows, headers, out_path, "block_types.md")
48 |
49 | headers = ["Method", "Avg Time"] + score_types
50 | inference_rows = [[k] for k in methods]
51 | all_raw_scores = [result["scores"][i] for i in result["scores"]]
52 | for i, method in enumerate(methods):
53 | avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method]))
54 | inference_rows[i].append(avg_time)
55 | for score_type in score_types:
56 | scores_lst = []
57 | for ar in all_raw_scores:
58 | try:
59 | # Sometimes a few llm scores are missing
60 | scores_lst.append(ar[method][score_type]["score"])
61 | except KeyError:
62 | continue
63 | avg_score = sum(scores_lst) / max(1, len(scores_lst))
64 | inference_rows[i].append(avg_score)
65 |
66 | write_table("Overall Results", inference_rows, headers, out_path, "overall.md")
67 |
68 | print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")
--------------------------------------------------------------------------------
/benchmarks/overall/download/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/benchmarks/overall/download/__init__.py
--------------------------------------------------------------------------------
/benchmarks/overall/download/base.py:
--------------------------------------------------------------------------------
1 | import json
2 | from json import JSONDecodeError
3 | from pathlib import Path
4 |
5 | import datasets
6 | from tqdm import tqdm
7 |
8 |
9 | class Downloader:
10 | cache_path: Path = Path("cache")
11 | service: str
12 |
13 | def __init__(self, api_key, app_id, max_rows: int = 2200):
14 | self.cache_path.mkdir(exist_ok=True)
15 | self.max_rows = max_rows
16 | self.api_key = api_key
17 | self.app_id = app_id
18 | self.ds = datasets.load_dataset("datalab-to/marker_benchmark", split="train")
19 |
20 | def get_html(self, pdf_bytes):
21 | raise NotImplementedError
22 |
23 | def upload_ds(self):
24 | rows = []
25 | for file in self.cache_path.glob("*.json"):
26 | with open(file, "r") as f:
27 | data = json.load(f)
28 | rows.append(data)
29 |
30 | out_ds = datasets.Dataset.from_list(rows, features=datasets.Features({
31 | "md": datasets.Value("string"),
32 | "uuid": datasets.Value("string"),
33 | "time": datasets.Value("float"),
34 | }))
35 | out_ds.push_to_hub(f"datalab-to/marker_benchmark_{self.service}", private=True)
36 |
37 | def generate_data(self):
38 | max_rows = self.max_rows
39 | for idx, sample in tqdm(enumerate(self.ds), desc=f"Saving {self.service} results"):
40 | cache_file = self.cache_path / f"{idx}.json"
41 | if cache_file.exists():
42 | continue
43 |
44 | pdf_bytes = sample["pdf"] # This is a single page PDF
45 | try:
46 | out_data = self.get_html(pdf_bytes)
47 | except JSONDecodeError as e:
48 | print(f"Error with sample {idx}: {e}")
49 | continue
50 | except Exception as e:
51 | print(f"Error with sample {idx}: {e}")
52 | continue
53 | out_data["uuid"] = sample["uuid"]
54 |
55 | with cache_file.open("w") as f:
56 | json.dump(out_data, f)
57 |
58 | if idx >= max_rows:
59 | break
60 |
61 | def __call__(self):
62 | self.generate_data()
63 | self.upload_ds()
64 |
--------------------------------------------------------------------------------
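
A minimal sketch of how a new service would plug into this base class; the service name and the returned markdown are placeholders, and a real subclass would call the service's API inside get_html:

    import time

    from benchmarks.overall.download.base import Downloader


    class ExampleDownloader(Downloader):
        service = "example"  # hypothetical name; used in the hub dataset id and progress bar

        def get_html(self, pdf_bytes):
            start = time.time()
            md = "# markdown returned by the service"  # replace with a real API call
            return {"md": md, "time": time.time() - start}
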
/benchmarks/overall/download/llamaparse.py:
--------------------------------------------------------------------------------
1 | import io
2 | import time
3 |
4 | import requests
5 |
6 | from benchmarks.overall.download.base import Downloader
7 |
8 |
9 | class LlamaParseDownloader(Downloader):
10 | service = "llamaparse"
11 |
12 | def get_html(self, pdf_bytes):
13 | rand_name = str(time.time()) + ".pdf"
14 | start = time.time()
15 | buff = io.BytesIO(pdf_bytes)
16 | md = upload_and_parse_file(self.api_key, rand_name, buff)
17 | end = time.time()
18 | if isinstance(md, bytes):
19 | md = md.decode("utf-8")
20 |
21 | return {
22 | "md": md,
23 | "time": end - start,
24 | }
25 |
26 |
27 | def upload_and_parse_file(api_key: str, fname: str, buff, max_retries: int = 180, delay: int = 1):
28 | headers = {
29 | "Authorization": f"Bearer {api_key}",
30 | "Accept": "application/json"
31 | }
32 |
33 | # Upload file
34 | files = {
35 | 'file': (fname, buff, 'application/pdf')
36 | }
37 | response = requests.post(
38 | 'https://api.cloud.llamaindex.ai/api/v1/parsing/upload',
39 | headers=headers,
40 | files=files
41 | )
42 | response.raise_for_status()
43 | job_id = response.json()['id']
44 |
45 | # Poll for completion
46 | for _ in range(max_retries):
47 | status_response = requests.get(
48 | f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}',
49 | headers=headers
50 | )
51 | status_response.raise_for_status()
52 | if status_response.json()['status'] == 'SUCCESS':
53 | # Get results
54 | result_response = requests.get(
55 | f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}/result/markdown',
56 | headers=headers
57 | )
58 | result_response.raise_for_status()
59 | return result_response.json()['markdown']
60 |
61 | time.sleep(delay)
62 |
63 | raise TimeoutError("Job did not complete within the maximum retry attempts")
--------------------------------------------------------------------------------
/benchmarks/overall/download/main.py:
--------------------------------------------------------------------------------
1 | import click
2 |
3 | from benchmarks.overall.download.llamaparse import LlamaParseDownloader
4 | from benchmarks.overall.download.mathpix import MathpixDownloader
5 | from benchmarks.overall.download.mistral import MistralDownloader
6 |
7 |
8 | @click.command("Download data from inference services")
9 | @click.argument("service", type=click.Choice(["mathpix", "llamaparse", "mistral"]))
10 | @click.option("--max_rows", type=int, default=2200)
11 | @click.option("--api_key", type=str, default=None)
12 | @click.option("--app_id", type=str, default=None)
13 | def main(service: str, max_rows: int, api_key: str, app_id: str):
14 | registry = {
15 | "mathpix": MathpixDownloader,
16 | "llamaparse": LlamaParseDownloader,
17 | "mistral": MistralDownloader,
18 | }
19 | downloader = registry[service](api_key, app_id, max_rows=max_rows)
20 |
21 | # Generate data and upload to hub
22 | downloader()
23 |
24 | if __name__ == "__main__":
25 | main()
26 |
--------------------------------------------------------------------------------
/benchmarks/overall/download/mathpix.py:
--------------------------------------------------------------------------------
1 | import json
2 | import time
3 |
4 | import requests
5 |
6 | from benchmarks.overall.download.base import Downloader
7 |
8 |
9 | class MathpixDownloader(Downloader):
10 | service = "mathpix"
11 |
12 | def get_html(self, pdf_bytes):
13 | headers = {
14 | "app_id": self.app_id,
15 | "app_key": self.api_key,
16 | }
17 | start = time.time()
18 | pdf_id = mathpix_request(pdf_bytes, headers)
19 | status = mathpix_status(pdf_id, headers)
20 | if status in ["processing", "error"]:
21 | md = ""
22 | else:
23 | md = mathpix_results(pdf_id, headers)
24 | end = time.time()
25 | if isinstance(md, bytes):
26 | md = md.decode("utf-8")
27 |
28 | return {
29 | "md": md,
30 | "time": end - start
31 | }
32 |
33 | def mathpix_request(buffer, headers):
34 | response = requests.post("https://api.mathpix.com/v3/pdf",
35 | headers=headers,
36 | data={
37 | "options_json": json.dumps(
38 | {
39 | "conversion_formats": {
40 | "md": True,
41 | "html": True
42 | }
43 | }
44 | )
45 | },
46 | files={
47 | "file": buffer
48 | }
49 | )
50 | data = response.json()
51 | pdf_id = data["pdf_id"]
52 | return pdf_id
53 |
54 | def mathpix_status(pdf_id, headers):
55 | max_iters = 120
56 | status = "processing"
57 | status2 = "processing"
58 | # Poll until both conversions finish; bounded so a stuck job cannot loop forever
59 | for _ in range(max_iters):
60 | time.sleep(1)
61 | response = requests.get(f"https://api.mathpix.com/v3/converter/{pdf_id}",
62 | headers=headers
63 | )
64 | status_resp = response.json()
65 | if "conversion_status" not in status_resp:
66 | continue
67 | status = status_resp["conversion_status"]["md"]["status"]
68 | status2 = status_resp["conversion_status"]["html"]["status"]
69 | if status == "completed" and status2 == "completed":
70 | break
71 | elif status == "error" or status2 == "error":
72 | break
73 | out_status = "completed" if status == "completed" and status2 == "completed" else "error"
74 | return out_status
75 |
76 | def mathpix_results(pdf_id, headers, ext="md"):
77 | response = requests.get(f"https://api.mathpix.com/v3/converter/{pdf_id}.{ext}",
78 | headers=headers
79 | )
80 | return response.content
81 |
--------------------------------------------------------------------------------
/benchmarks/overall/download/mistral.py:
--------------------------------------------------------------------------------
1 | import io
2 | import time
3 | import requests
4 |
5 | from benchmarks.overall.download.base import Downloader
6 |
7 |
8 | class MistralDownloader(Downloader):
9 | service = "mistral"
10 |
11 | def get_html(self, pdf_bytes):
12 | rand_name = str(time.time()) + ".pdf"
13 | start = time.time()
14 | buff = io.BytesIO(pdf_bytes)
15 | md = upload_and_process_file(self.api_key, rand_name, buff)
16 | end = time.time()
17 | if isinstance(md, bytes):
18 | md = md.decode("utf-8")
19 |
20 | return {
21 | "md": md,
22 | "time": end - start,
23 | }
24 |
25 |
26 | def upload_and_process_file(api_key: str, fname: str, buff):
27 | headers = {
28 | "Authorization": f"Bearer {api_key}"
29 | }
30 |
31 | upload_headers = headers.copy()
32 | files = {
33 | 'file': (fname, buff, 'application/pdf'),
34 | 'purpose': (None, 'ocr')
35 | }
36 |
37 | upload_response = requests.post(
38 | 'https://api.mistral.ai/v1/files',
39 | headers=upload_headers,
40 | files=files
41 | )
42 | upload_response.raise_for_status()
43 | file_id = upload_response.json()['id']
44 |
45 | url_headers = headers.copy()
46 | url_headers["Accept"] = "application/json"
47 |
48 | url_response = requests.get(
49 | f'https://api.mistral.ai/v1/files/{file_id}/url?expiry=24',
50 | headers=url_headers
51 | )
52 | url_response.raise_for_status()
53 | signed_url = url_response.json()['url']
54 |
55 | ocr_headers = headers.copy()
56 | ocr_headers["Content-Type"] = "application/json"
57 |
58 | ocr_data = {
59 | "model": "mistral-ocr-latest",
60 | "document": {
61 | "type": "document_url",
62 | "document_url": signed_url
63 | },
64 | "include_image_base64": True
65 | }
66 | ocr_response = requests.post(
67 | 'https://api.mistral.ai/v1/ocr',
68 | headers=ocr_headers,
69 | json=ocr_data
70 | )
71 | ocr_response.raise_for_status()
72 | result = ocr_response.json()
73 | return result["pages"][0]["markdown"]
--------------------------------------------------------------------------------
/benchmarks/overall/methods/docling.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | import time
3 |
4 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult
5 |
6 |
7 | class DoclingMethod(BaseMethod):
8 | model_dict: dict = None
9 | use_llm: bool = False
10 |
11 | def __call__(self, sample) -> BenchmarkResult:
12 | from docling.document_converter import DocumentConverter
13 | pdf_bytes = sample["pdf"] # This is a single page PDF
14 | converter = DocumentConverter()
15 |
16 | with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
17 | f.write(pdf_bytes)
18 | start = time.time()
19 | result = converter.convert(f.name)
20 | total = time.time() - start
21 |
22 | return {
23 | "markdown": result.document.export_to_markdown(),
24 | "time": total
25 | }
26 |
27 |
--------------------------------------------------------------------------------
/benchmarks/overall/methods/gt.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | import json
3 |
4 | from PIL import Image
5 |
6 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult
7 |
8 |
9 | class GTMethod(BaseMethod):
10 | def __call__(self, sample) -> BenchmarkResult:
11 | gt_blocks = json.loads(sample["gt_blocks"])
12 | gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0]
13 | gt_markdown = [self.convert_to_md(block) for block in gt_html]
14 | return {
15 | "markdown": gt_markdown,
16 | "time": 0
17 | }
18 |
19 | def render(self, html: List[str]) -> Image.Image:
20 | joined = "\n\n".join(html)
21 | html = f"""
22 |
23 |
24 |
25 | {joined}
26 |
27 |
28 | """.strip()
29 | return self.html_to_image(html)
--------------------------------------------------------------------------------
/benchmarks/overall/methods/llamaparse.py:
--------------------------------------------------------------------------------
1 | import datasets
2 |
3 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult
4 |
5 |
6 | class LlamaParseMethod(BaseMethod):
7 | llamaparse_ds: datasets.Dataset = None
8 |
9 | def __call__(self, sample) -> BenchmarkResult:
10 | uuid = sample["uuid"]
11 | data = None
12 | for row in self.llamaparse_ds:
13 | if str(row["uuid"]) == str(uuid):
14 | data = row
15 | break
16 | if not data:
17 | raise ValueError(f"Could not find data for uuid {uuid}")
18 |
19 | return {
20 | "markdown": data["md"],
21 | "time": data["time"]
22 | }
--------------------------------------------------------------------------------
/benchmarks/overall/methods/marker.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | import time
4 |
5 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult
6 | from marker.config.parser import ConfigParser
7 | from marker.converters.pdf import PdfConverter
8 |
9 |
10 | class MarkerMethod(BaseMethod):
11 | model_dict: dict = None
12 | use_llm: bool = False
13 |
14 | def __call__(self, sample) -> BenchmarkResult:
15 | pdf_bytes = sample["pdf"] # This is a single page PDF
16 | parser = ConfigParser({
17 | "page_range": "0",
18 | "disable_tqdm": True,
19 | "use_llm": self.use_llm,
20 | "redo_inline_math": self.use_llm,
21 | "llm_service": "marker.services.vertex.GoogleVertexService",
22 | "vertex_project_id": os.getenv("VERTEX_PROJECT_ID"),
23 | })
24 |
25 | block_converter = PdfConverter(
26 | artifact_dict=self.model_dict,
27 | config=parser.generate_config_dict(),
28 | llm_service=parser.get_llm_service()
29 | )
30 |
31 | with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
32 | f.write(pdf_bytes)
33 | start = time.time()
34 | rendered = block_converter(f.name)
35 | total = time.time() - start
36 |
37 | return {
38 | "markdown": rendered.markdown,
39 | "time": total
40 | }
41 |
42 |
--------------------------------------------------------------------------------
/benchmarks/overall/methods/mathpix.py:
--------------------------------------------------------------------------------
1 | import datasets
2 |
3 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult
4 |
5 |
6 | class MathpixMethod(BaseMethod):
7 | mathpix_ds: datasets.Dataset = None
8 |
9 | def __call__(self, sample) -> BenchmarkResult:
10 | uuid = sample["uuid"]
11 | data = None
12 | for row in self.mathpix_ds:
13 | if str(row["uuid"]) == str(uuid):
14 | data = row
15 | break
16 | if not data:
17 | raise ValueError(f"Could not find data for uuid {uuid}")
18 |
19 | return {
20 | "markdown": data["md"],
21 | "time": data["time"]
22 | }
--------------------------------------------------------------------------------
/benchmarks/overall/methods/mistral.py:
--------------------------------------------------------------------------------
1 | import datasets
2 |
3 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult
4 |
5 |
6 | class MistralMethod(BaseMethod):
7 | mistral_ds: datasets.Dataset = None
8 |
9 | def __call__(self, sample) -> BenchmarkResult:
10 | uuid = sample["uuid"]
11 | data = None
12 | for row in self.mistral_ds:
13 | if str(row["uuid"]) == str(uuid):
14 | data = row
15 | break
16 | if not data:
17 | raise ValueError(f"Could not find data for uuid {uuid}")
18 |
19 | return {
20 | "markdown": data["md"],
21 | "time": data["time"]
22 | }
--------------------------------------------------------------------------------
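
Note that LlamaParseMethod, MathpixMethod, and MistralMethod all scan their full dataset for every sample, so each lookup is O(n). If the benchmark grows, a one-time uuid index would avoid that; a sketch, not part of the repo:

    def build_uuid_index(ds):
        # One pass over the dataset; every later lookup becomes O(1).
        return {str(row["uuid"]): row for row in ds}
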
/benchmarks/overall/methods/olmocr.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import json
3 | import tempfile
4 | import time
5 | from io import BytesIO
6 |
7 | import torch
8 | from PIL import Image
9 |
10 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult
11 |
12 |
13 | def convert_single_page(filename: str, model, processor, device):
14 | from olmocr.data.renderpdf import render_pdf_to_base64png
15 | from olmocr.prompts import build_finetuning_prompt
16 | from olmocr.prompts.anchor import get_anchor_text
17 |
18 | image_base64 = render_pdf_to_base64png(filename, 1, target_longest_image_dim=1024)
19 |
20 | # Build the prompt, using document metadata
21 | anchor_text = get_anchor_text(filename, 1, pdf_engine="pdfreport", target_length=4000)
22 | prompt = build_finetuning_prompt(anchor_text)
23 |
24 | # Build the full prompt
25 | messages = [
26 | {
27 | "role": "user",
28 | "content": [
29 | {"type": "text", "text": prompt},
30 | {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
31 | ],
32 | }
33 | ]
34 |
35 | # Apply the chat template and processor
36 | text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
37 | main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
38 |
39 | inputs = processor(
40 | text=[text],
41 | images=[main_image],
42 | padding=True,
43 | return_tensors="pt",
44 | )
45 | inputs = {key: value.to(device) for (key, value) in inputs.items()}
46 |
47 | # Generate the output
48 | output = model.generate(
49 | **inputs,
50 | temperature=0.8,
51 | max_new_tokens=8192,
52 | num_return_sequences=1,
53 | do_sample=True,
54 | )
55 |
56 | # Decode the output
57 | prompt_length = inputs["input_ids"].shape[1]
58 | new_tokens = output[:, prompt_length:]
59 | text_output = processor.tokenizer.batch_decode(
60 | new_tokens, skip_special_tokens=True
61 | )[0]
62 |
63 | try:
64 | text_output = json.loads(text_output)
65 | text = text_output["natural_text"]
66 | except Exception:
67 | try:
68 | text = text_output.split("natural_text")[1].strip()
69 | except Exception:
70 | text = ""
71 |
72 | return text
73 |
74 |
75 | class OlmOCRMethod(BaseMethod):
76 | olmocr_model: dict = None
77 | use_llm: bool = False
78 |
79 | def __call__(self, sample) -> BenchmarkResult:
80 | pdf_bytes = sample["pdf"] # This is a single page PDF
81 |
82 | with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
83 | f.write(pdf_bytes)
84 | start = time.time()
85 | result = convert_single_page(f.name, self.olmocr_model["model"], self.olmocr_model["processor"], self.olmocr_model["model"].device)
86 | total = time.time() - start
87 |
88 | return {
89 | "markdown": result,
90 | "time": total
91 | }
92 |
--------------------------------------------------------------------------------
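
convert_single_page expects the model and processor bundled in self.olmocr_model. A sketch of building that dict, assuming the olmOCR preview checkpoint is a Qwen2-VL finetune as its docs describe (both model ids are assumptions to verify):

    import torch
    from transformers import AutoProcessor, Qwen2VLForConditionalGeneration


    def load_olmocr_model(device: str = "cuda") -> dict:
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16  # assumed checkpoint id
        ).eval().to(device)
        processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")  # assumed processor id
        return {"model": model, "processor": processor}
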
/benchmarks/overall/methods/schema.py:
--------------------------------------------------------------------------------
1 | from typing import TypedDict, List
2 |
3 |
4 | class BenchmarkResult(TypedDict):
5 | markdown: str | List[str]
6 | time: float | None
--------------------------------------------------------------------------------
/benchmarks/overall/registry.py:
--------------------------------------------------------------------------------
1 | from benchmarks.overall.methods.docling import DoclingMethod
2 | from benchmarks.overall.methods.gt import GTMethod
3 | from benchmarks.overall.methods.llamaparse import LlamaParseMethod
4 | from benchmarks.overall.methods.marker import MarkerMethod
5 | from benchmarks.overall.methods.mathpix import MathpixMethod
6 | from benchmarks.overall.methods.mistral import MistralMethod
7 | from benchmarks.overall.methods.olmocr import OlmOCRMethod
8 | from benchmarks.overall.scorers.heuristic import HeuristicScorer
9 | from benchmarks.overall.scorers.llm import LLMScorer
10 |
11 | SCORE_REGISTRY = {
12 | "heuristic": HeuristicScorer,
13 | "llm": LLMScorer
14 | }
15 |
16 | METHOD_REGISTRY = {
17 | "marker": MarkerMethod,
18 | "gt": GTMethod,
19 | "mathpix": MathpixMethod,
20 | "llamaparse": LlamaParseMethod,
21 | "docling": DoclingMethod,
22 | "olmocr": OlmOCRMethod,
23 | "mistral": MistralMethod
24 | }
--------------------------------------------------------------------------------
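
A sketch of how these registries compose on a single benchmark sample (assuming the methods accept their configuration as keyword arguments, as the method classes' attributes suggest):

    from benchmarks.overall.registry import METHOD_REGISTRY, SCORE_REGISTRY


    def score_one_sample(sample, method_name="marker", score_type="heuristic", **method_kwargs):
        gt_markdown = METHOD_REGISTRY["gt"]()(sample)["markdown"]  # list of gt markdown blocks
        result = METHOD_REGISTRY[method_name](**method_kwargs)(sample)
        return SCORE_REGISTRY[score_type]()(sample, gt_markdown, result["markdown"])
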
/benchmarks/overall/schema.py:
--------------------------------------------------------------------------------
1 | from typing import TypedDict, List, Dict
2 |
3 | from benchmarks.overall.scorers.schema import BlockScores
4 |
5 | AVG_TYPE = Dict[str, Dict[str, Dict[str, List[float]]]]
6 |
7 | class FullResult(TypedDict):
8 | scores: Dict[int, Dict[str, Dict[str, BlockScores]]]
9 | averages_by_type: AVG_TYPE
10 | averages_by_block_type: AVG_TYPE
11 | average_times: Dict[str, List[float]]
12 | markdown: Dict[int, Dict[str, str]]
13 |
--------------------------------------------------------------------------------
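
Concretely, a FullResult holding one sample scored with one method looks roughly like this (keys and values are illustrative only):

    example_full_result = {
        "scores": {0: {"marker": {"heuristic": {"score": 95.2, "specific_scores": {"order": 100.0}}}}},
        "averages_by_type": {"marker": {"heuristic": {"scientific_paper": [95.2]}}},
        "averages_by_block_type": {"marker": {"heuristic": {"Table": [88.0]}}},
        "average_times": {"marker": [1.4]},
        "markdown": {0: {"marker": "# Title\n\nBody text..."}},
    }
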
/benchmarks/overall/scorers/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from benchmarks.overall.scorers.schema import BlockScores
4 |
5 |
6 | class BaseScorer:
7 | def __init__(self):
8 | pass
9 |
10 | def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores:
11 | raise NotImplementedError()
--------------------------------------------------------------------------------
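
A minimal concrete scorer against this interface, using difflib rather than the repo's heuristic (a sketch for orientation, not a replacement for heuristic.py):

    from difflib import SequenceMatcher
    from typing import List

    from benchmarks.overall.scorers import BaseScorer
    from benchmarks.overall.scorers.schema import BlockScores


    class RatioScorer(BaseScorer):
        def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores:
            # 0-100 similarity between the joined gt blocks and the method output
            ratio = SequenceMatcher(None, "\n\n".join(gt_markdown), method_markdown or "").ratio()
            return {"score": ratio * 100, "specific_scores": {}}
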
/benchmarks/overall/scorers/schema.py:
--------------------------------------------------------------------------------
1 | from typing import TypedDict, List, Optional, Dict
2 |
3 |
4 | class BlockScores(TypedDict):
5 | score: float
6 | specific_scores: Dict[str, float | List[float]]
7 |
--------------------------------------------------------------------------------
/benchmarks/table/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/benchmarks/table/__init__.py
--------------------------------------------------------------------------------
/benchmarks/table/gemini.py:
--------------------------------------------------------------------------------
1 | import json
2 | from PIL import Image
3 | from google import genai
4 | from google.genai import types
5 | from io import BytesIO
6 | from pydantic import BaseModel
7 |
8 | from marker.settings import settings
9 |
10 | prompt = """
11 | You're an expert document analyst who is good at turning tables in documents into HTML. Analyze the provided image, and convert it to a faithful HTML representation.
12 |
13 | Guidelines:
14 | - Keep the HTML simple and concise.
15 | - Only include the <table> tag and contents.
16 | - Only use <table>, <tr>, and <td> tags. Only use the colspan and rowspan attributes if necessary. Do not use <th>, <thead>, or <tbody> tags.
17 | - Make sure the table is as faithful to the image as possible with the given tags.
18 |
19 | **Instructions**
20 | 1. Analyze the image, and determine the table structure.
21 | 2. Convert the table image to HTML, following the guidelines above.
22 | 3. Output only the HTML for the table, starting with the <table> tag.
23 | """.strip()
24 |
25 | class TableSchema(BaseModel):
26 | table_html: str
27 |
28 | def gemini_table_rec(image: Image.Image):
29 | client = genai.Client(
30 | api_key=settings.GOOGLE_API_KEY,
31 | http_options={"timeout": 60000}
32 | )
33 |
34 | image_bytes = BytesIO()
35 | image.save(image_bytes, format="PNG")
36 |
37 | responses = client.models.generate_content(
38 | model="gemini-2.0-flash",
39 | contents=[types.Part.from_bytes(data=image_bytes.getvalue(), mime_type="image/png"), prompt], # According to gemini docs, it performs better if the image is the first element
40 | config={
41 | "temperature": 0,
42 | "response_schema": TableSchema,
43 | "response_mime_type": "application/json",
44 | },
45 | )
46 |
47 | output = responses.candidates[0].content.parts[0].text
48 | return json.loads(output)["table_html"]
--------------------------------------------------------------------------------
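
gemini_table_rec takes a PIL image of a table crop and returns the predicted HTML string. A usage sketch (the image path is hypothetical, and settings.GOOGLE_API_KEY must be configured):

    from PIL import Image

    from benchmarks.table.gemini import gemini_table_rec

    table_img = Image.open("table_crop.png")  # hypothetical crop of a table region
    html = gemini_table_rec(table_img)  # e.g. "<table><tr><td>...</td></tr></table>"
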
/benchmarks/throughput/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/benchmarks/throughput/__init__.py
--------------------------------------------------------------------------------
/benchmarks/verify_scores.py:
--------------------------------------------------------------------------------
1 | import json
2 | import argparse
3 |
4 |
5 | def verify_scores(file_path):
6 | with open(file_path, 'r') as file:
7 | data = json.load(file)
8 |
9 | raw_scores = [data["scores"][k] for k in data["scores"]]
10 | marker_scores = [r["marker"]["heuristic"]["score"] for r in raw_scores]
11 | marker_score = sum(marker_scores) / len(marker_scores)
12 | if marker_score < 90:
13 | raise ValueError("Marker score below 90")
14 |
15 |
16 | def verify_table_scores(file_path):
17 | with open(file_path, 'r') as file:
18 | data = json.load(file)
19 |
20 | avg = sum([r["marker_score"] for r in data["marker"]]) / len(data["marker"])
21 | if avg < 0.7:
22 | raise ValueError("Average score is below the required threshold of 0.7")
23 |
24 |
25 | if __name__ == "__main__":
26 | parser = argparse.ArgumentParser(description="Verify benchmark scores")
27 | parser.add_argument("file_path", type=str, help="Path to the json file")
28 | parser.add_argument("--type", type=str, help="Type of file to verify", default="marker")
29 | args = parser.parse_args()
30 | if args.type == "marker":
31 | verify_scores(args.file_path)
32 | elif args.type == "table":
33 | verify_table_scores(args.file_path)
34 |
--------------------------------------------------------------------------------
/chunk_convert.py:
--------------------------------------------------------------------------------
1 | from marker.scripts.chunk_convert import chunk_convert_cli
2 |
3 | if __name__ == "__main__":
4 | chunk_convert_cli()
--------------------------------------------------------------------------------
/convert.py:
--------------------------------------------------------------------------------
1 | from marker.scripts.convert import convert_cli
2 |
3 | if __name__ == "__main__":
4 | convert_cli()
5 |
--------------------------------------------------------------------------------
/convert_single.py:
--------------------------------------------------------------------------------
1 | from marker.scripts.convert_single import convert_single_cli
2 |
3 | if __name__ == "__main__":
4 | convert_single_cli()
5 |
--------------------------------------------------------------------------------
/data/.gitignore:
--------------------------------------------------------------------------------
1 | latex
2 | pdfs
3 | references
--------------------------------------------------------------------------------
/data/examples/markdown/multicolcnn/_page_1_Figure_0.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/multicolcnn/_page_1_Figure_0.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/multicolcnn/_page_2_Picture_0.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/multicolcnn/_page_2_Picture_0.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/multicolcnn/_page_6_Figure_0.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/multicolcnn/_page_6_Figure_0.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/multicolcnn/_page_7_Figure_0.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/multicolcnn/_page_7_Figure_0.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/switch_transformers/_page_11_Figure_4.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_11_Figure_4.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/switch_transformers/_page_12_Figure_4.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_12_Figure_4.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/switch_transformers/_page_13_Figure_2.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_13_Figure_2.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/switch_transformers/_page_18_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_18_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/switch_transformers/_page_18_Figure_3.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_18_Figure_3.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/switch_transformers/_page_20_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_20_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/switch_transformers/_page_20_Figure_4.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_20_Figure_4.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/switch_transformers/_page_27_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_27_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/switch_transformers/_page_29_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_29_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/switch_transformers/_page_2_Figure_3.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_2_Figure_3.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/switch_transformers/_page_30_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_30_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/switch_transformers/_page_31_Figure_3.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_31_Figure_3.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/switch_transformers/_page_4_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_4_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/switch_transformers/_page_5_Figure_3.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_5_Figure_3.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_109_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_109_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_115_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_115_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_116_Figure_3.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_116_Figure_3.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_127_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_127_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_128_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_128_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_167_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_167_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_169_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_169_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_173_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_173_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_190_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_190_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_195_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_195_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_205_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_205_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_230_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_230_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_233_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_233_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_233_Figure_3.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_233_Figure_3.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_234_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_234_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_235_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_235_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_236_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_236_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_236_Figure_3.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_236_Figure_3.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_237_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_237_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_238_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_238_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_23_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_23_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_23_Figure_3.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_23_Figure_3.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_46_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_46_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_60_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_60_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_60_Figure_3.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_60_Figure_3.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_67_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_67_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_71_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_71_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_78_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_78_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_85_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_85_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_94_Figure_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_94_Figure_1.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_99_Figure_17.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_99_Figure_17.jpeg
--------------------------------------------------------------------------------
/data/examples/markdown/thinkpython/_page_99_Figure_178.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_99_Figure_178.jpeg
--------------------------------------------------------------------------------
/data/images/overall.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/images/overall.png
--------------------------------------------------------------------------------
/data/images/per_doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/images/per_doc.png
--------------------------------------------------------------------------------
/data/images/table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/images/table.png
--------------------------------------------------------------------------------
/data/latex_to_md.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # List all .tex files in the latex folder
4 | FILES=$(find latex -name "*.tex")
5 |
6 | for f in $FILES
7 | do
8 | echo "Processing $f file..."
9 | base_name=$(basename "$f" .tex)
10 | out_file="references/${base_name}.md"
11 |
12 | pandoc --wrap=none \
13 | --no-highlight \
14 | --strip-comments \
15 | --from=latex \
16 | --to=commonmark_x+pipe_tables \
17 | "$f" \
18 | -o "$out_file"
19 | # Replace non-breaking spaces
20 |   sed -i.bak 's/ / /g' "$out_file"
21 |   sed -i.bak 's/ / /g' "$out_file"
22 |   sed -i.bak 's/ / /g' "$out_file"
23 |   sed -i.bak 's/ / /g' "$out_file"
24 | sed -i.bak -E 's/`\\cite`//g; s/<[^>]*>//g; s/\{[^}]*\}//g; s/\\cite\{[^}]*\}//g' "$out_file"
25 | sed -i.bak -E '
26 | s/`\\cite`//g; # Remove \cite commands inside backticks
27 | s/::: //g; # Remove the leading ::: for content markers
28 | s/\[//g; # Remove opening square bracket
29 | s/\]//g; # Remove closing square bracket
30 | ' "$out_file"
31 | # Remove .bak file
32 | rm "$out_file.bak"
33 | done
34 |
35 |
--------------------------------------------------------------------------------
/extraction_app.py:
--------------------------------------------------------------------------------
1 | from marker.scripts.run_streamlit_app import extraction_app_cli
2 |
3 | if __name__ == "__main__":
4 | extraction_app_cli()
5 |
--------------------------------------------------------------------------------
/marker/builders/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from pydantic import BaseModel
4 |
5 | from marker.util import assign_config
6 |
7 |
8 | class BaseBuilder:
9 | def __init__(self, config: Optional[BaseModel | dict] = None):
10 | assign_config(self, config)
11 |
12 | def __call__(self, data, *args, **kwargs):
13 | raise NotImplementedError
14 |
--------------------------------------------------------------------------------
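
A quick note on `BaseBuilder` above: a minimal sketch of how it is meant to be subclassed, assuming `assign_config` (from `marker.util`) copies matching keys from the config dict onto the instance so class-level defaults become overridable settings. `ScaleBuilder` and its `scale` option are hypothetical.

```python
from marker.builders import BaseBuilder

class ScaleBuilder(BaseBuilder):  # hypothetical example builder
    scale: float = 1.0  # class-level default, assumed overridable via config

    def __call__(self, data, *args, **kwargs):
        return data * self.scale

builder = ScaleBuilder(config={"scale": 2.0})
print(builder(3))  # 6.0, assuming assign_config copied "scale" onto the instance
```
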
/marker/builders/document.py:
--------------------------------------------------------------------------------
1 | from typing import Annotated
2 |
3 | from marker.builders import BaseBuilder
4 | from marker.builders.layout import LayoutBuilder
5 | from marker.builders.line import LineBuilder
6 | from marker.builders.ocr import OcrBuilder
7 | from marker.providers.pdf import PdfProvider
8 | from marker.schema import BlockTypes
9 | from marker.schema.document import Document
10 | from marker.schema.groups.page import PageGroup
11 | from marker.schema.registry import get_block_class
12 |
13 |
14 | class DocumentBuilder(BaseBuilder):
15 | """
16 | Constructs a Document given a PdfProvider, LayoutBuilder, and OcrBuilder.
17 | """
18 | lowres_image_dpi: Annotated[
19 | int,
20 | "DPI setting for low-resolution page images used for Layout and Line Detection.",
21 | ] = 96
22 | highres_image_dpi: Annotated[
23 | int,
24 | "DPI setting for high-resolution page images used for OCR.",
25 | ] = 192
26 | disable_ocr: Annotated[
27 | bool,
28 | "Disable OCR processing.",
29 | ] = False
30 |
31 | def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, line_builder: LineBuilder, ocr_builder: OcrBuilder):
32 | document = self.build_document(provider)
33 | layout_builder(document, provider)
34 | line_builder(document, provider)
35 | if not self.disable_ocr:
36 | ocr_builder(document, provider)
37 | return document
38 |
39 | def build_document(self, provider: PdfProvider):
40 | PageGroupClass: PageGroup = get_block_class(BlockTypes.Page)
41 | lowres_images = provider.get_images(provider.page_range, self.lowres_image_dpi)
42 | highres_images = provider.get_images(provider.page_range, self.highres_image_dpi)
43 | initial_pages = [
44 | PageGroupClass(
45 | page_id=p,
46 | lowres_image=lowres_images[i],
47 | highres_image=highres_images[i],
48 | polygon=provider.get_page_bbox(p),
49 | refs=provider.get_page_refs(p)
50 | ) for i, p in enumerate(provider.page_range)
51 | ]
52 | DocumentClass: Document = get_block_class(BlockTypes.Document)
53 | return DocumentClass(filepath=provider.filepath, pages=initial_pages)
54 |
--------------------------------------------------------------------------------
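
The DPI and OCR settings on `DocumentBuilder` above are plain config keys; a minimal sketch of overriding them, assuming (as with the other builders) that `assign_config` maps config keys onto the annotated attributes:

```python
from marker.builders.document import DocumentBuilder

# Lower the OCR image resolution and skip OCR entirely; the keys mirror
# the Annotated attributes declared on DocumentBuilder.
builder = DocumentBuilder(config={"highres_image_dpi": 144, "disable_ocr": True})
assert builder.disable_ocr and builder.highres_image_dpi == 144
```
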
/marker/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/marker/config/__init__.py
--------------------------------------------------------------------------------
/marker/converters/__init__.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | from typing import Optional, List, Type
3 |
4 | from pydantic import BaseModel
5 |
6 | from marker.processors import BaseProcessor
7 | from marker.processors.llm import BaseLLMSimpleBlockProcessor
8 | from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor
9 | from marker.util import assign_config, download_font
10 |
11 |
12 | class BaseConverter:
13 | def __init__(self, config: Optional[BaseModel | dict] = None):
14 | assign_config(self, config)
15 | self.config = config
16 | self.llm_service = None
17 |
18 | # Download render font, needed for some providers
19 | download_font()
20 |
21 | def __call__(self, *args, **kwargs):
22 | raise NotImplementedError
23 |
24 | def resolve_dependencies(self, cls):
25 | init_signature = inspect.signature(cls.__init__)
26 | parameters = init_signature.parameters
27 |
28 | resolved_kwargs = {}
29 | for param_name, param in parameters.items():
30 | if param_name == 'self':
31 | continue
32 | elif param_name == 'config':
33 | resolved_kwargs[param_name] = self.config
34 |             elif param_name in self.artifact_dict:
35 | resolved_kwargs[param_name] = self.artifact_dict[param_name]
36 | elif param.default != inspect.Parameter.empty:
37 | resolved_kwargs[param_name] = param.default
38 | else:
39 | raise ValueError(f"Cannot resolve dependency for parameter: {param_name}")
40 |
41 | return cls(**resolved_kwargs)
42 |
43 | def initialize_processors(self, processor_cls_lst: List[Type[BaseProcessor]]) -> List[BaseProcessor]:
44 | processors = []
45 | for processor_cls in processor_cls_lst:
46 | processors.append(self.resolve_dependencies(processor_cls))
47 |
48 | simple_llm_processors = [p for p in processors if issubclass(type(p), BaseLLMSimpleBlockProcessor)]
49 | other_processors = [p for p in processors if not issubclass(type(p), BaseLLMSimpleBlockProcessor)]
50 |
51 | if not simple_llm_processors:
52 | return processors
53 |
54 | llm_positions = [i for i, p in enumerate(processors) if issubclass(type(p), BaseLLMSimpleBlockProcessor)]
55 | insert_position = max(0, llm_positions[-1] - len(simple_llm_processors) + 1)
56 |
57 | meta_processor = LLMSimpleBlockMetaProcessor(
58 | processor_lst=simple_llm_processors,
59 | llm_service=self.llm_service,
60 | config=self.config,
61 | )
62 | other_processors.insert(insert_position, meta_processor)
63 | return other_processors
--------------------------------------------------------------------------------
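
To make the resolution rules in `resolve_dependencies` above concrete, here is a standalone restatement of the same loop with a hypothetical consumer class. A parameter can only be satisfied by `config`, an entry in `artifact_dict`, or a declared default:

```python
import inspect

def resolve(cls, config, artifact_dict):
    # Standalone restatement of BaseConverter.resolve_dependencies, for illustration.
    kwargs = {}
    for name, param in inspect.signature(cls.__init__).parameters.items():
        if name == "self":
            continue
        elif name == "config":
            kwargs[name] = config
        elif name in artifact_dict:
            kwargs[name] = artifact_dict[name]
        elif param.default is not inspect.Parameter.empty:
            kwargs[name] = param.default
        else:
            raise ValueError(f"Cannot resolve dependency for parameter: {name}")
    return cls(**kwargs)

class NeedsModels:  # hypothetical consumer
    def __init__(self, config=None, layout_model=None, retries=3):
        self.layout_model, self.retries = layout_model, retries

obj = resolve(NeedsModels, {"some_key": 1}, {"layout_model": "stub"})
print(obj.layout_model, obj.retries)  # stub 3
```
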
/marker/converters/extraction.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 |
4 | from marker.builders.document import DocumentBuilder
5 | from marker.builders.line import LineBuilder
6 | from marker.builders.ocr import OcrBuilder
7 | from marker.builders.structure import StructureBuilder
8 | from marker.converters.pdf import PdfConverter
9 | from marker.extractors.page import PageExtractor, json_schema_to_base_model
10 | from marker.providers.registry import provider_from_filepath
11 |
12 | from marker.renderers.extraction import ExtractionRenderer, ExtractionOutput
13 | from marker.renderers.markdown import MarkdownRenderer
14 |
15 | from marker.logger import get_logger
16 |
17 | logger = get_logger()
18 |
19 |
20 | class ExtractionConverter(PdfConverter):
21 | pattern: str = r"{\d+\}-{48}\n\n"
22 |
23 | def build_document(self, filepath: str):
24 | provider_cls = provider_from_filepath(filepath)
25 | layout_builder = self.resolve_dependencies(self.layout_builder_class)
26 | line_builder = self.resolve_dependencies(LineBuilder)
27 | ocr_builder = self.resolve_dependencies(OcrBuilder)
28 | provider = provider_cls(filepath, self.config)
29 | document = DocumentBuilder(self.config)(
30 | provider, layout_builder, line_builder, ocr_builder
31 | )
32 | structure_builder_cls = self.resolve_dependencies(StructureBuilder)
33 | structure_builder_cls(document)
34 |
35 | for processor in self.processor_list:
36 | processor(document)
37 |
38 | return document, provider
39 |
40 | def __call__(self, filepath: str) -> ExtractionOutput:
41 | self.config["paginate_output"] = True # Ensure we can split the output properly
42 | self.config["output_format"] = (
43 | "markdown" # Output must be markdown for extraction
44 | )
45 | try:
46 | json_schema_to_base_model(json.loads(self.config["page_schema"]))
47 | except Exception as e:
48 | logger.error(f"Could not parse page schema: {e}")
49 | raise ValueError(
50 | "Could not parse your page schema. Please check the schema format."
51 | )
52 |
53 | document, provider = self.build_document(filepath)
54 | renderer = self.resolve_dependencies(MarkdownRenderer)
55 | output = renderer(document)
56 |
57 | output_pages = re.split(self.pattern, output.markdown)[
58 | 1:
59 | ] # Split output into pages
60 |
61 |         # Extraction needs an LLM service; set it in the artifact dict so the extractor can resolve it
62 | if not self.artifact_dict["llm_service"]:
63 | self.artifact_dict["llm_service"] = self.resolve_dependencies(
64 | self.default_llm_service
65 | )
66 |
67 | extractor = self.resolve_dependencies(PageExtractor)
68 | renderer = self.resolve_dependencies(ExtractionRenderer)
69 |
70 | pnums = provider.page_range
71 | all_json = {}
72 | for page, page_md, pnum in zip(document.pages, output_pages, pnums):
73 | extracted_json = extractor(document, page, page_md.strip())
74 | all_json[pnum] = extracted_json
75 |
76 | merged = renderer(all_json)
77 | return merged
78 |
--------------------------------------------------------------------------------
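
A hedged usage sketch for `ExtractionConverter` above. It assumes the `artifact_dict`/`create_model_dict` wiring shown in `marker/models.py` below, an LLM service configured via the environment (e.g. an API key for the default service), and an illustrative `page_schema`; the input filename is hypothetical.

```python
import json
from marker.converters.extraction import ExtractionConverter
from marker.models import create_model_dict

# Illustrative JSON schema describing what to pull from each page.
page_schema = json.dumps({
    "title": "PageData",
    "type": "object",
    "properties": {"invoice_total": {"type": "string"}},
})

converter = ExtractionConverter(
    artifact_dict=create_model_dict(),
    config={"page_schema": page_schema},
)
result = converter("invoice.pdf")  # hypothetical input file
print(result.document_json)
```
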
/marker/converters/ocr.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple
2 |
3 | from marker.builders.document import DocumentBuilder
4 | from marker.builders.line import LineBuilder
5 | from marker.builders.ocr import OcrBuilder
6 | from marker.converters.pdf import PdfConverter
7 | from marker.processors import BaseProcessor
8 | from marker.processors.equation import EquationProcessor
9 | from marker.providers.registry import provider_from_filepath
10 | from marker.renderers.ocr_json import OCRJSONRenderer
11 |
12 |
13 | class OCRConverter(PdfConverter):
14 | default_processors: Tuple[BaseProcessor, ...] = (EquationProcessor,)
15 |
16 | def __init__(self, *args, **kwargs):
17 | super().__init__(*args, **kwargs)
18 |
19 | if not self.config:
20 | self.config = {}
21 |
22 | self.config["format_lines"] = True
23 | self.renderer = OCRJSONRenderer
24 |
25 | def build_document(self, filepath: str):
26 | provider_cls = provider_from_filepath(filepath)
27 | layout_builder = self.resolve_dependencies(self.layout_builder_class)
28 | line_builder = self.resolve_dependencies(LineBuilder)
29 | ocr_builder = self.resolve_dependencies(OcrBuilder)
30 | document_builder = DocumentBuilder(self.config)
31 |
32 | provider = provider_cls(filepath, self.config)
33 | document = document_builder(provider, layout_builder, line_builder, ocr_builder)
34 |
35 | for processor in self.processor_list:
36 | processor(document)
37 |
38 | return document
39 |
40 | def __call__(self, filepath: str):
41 | document = self.build_document(filepath)
42 | renderer = self.resolve_dependencies(self.renderer)
43 | return renderer(document)
44 |
--------------------------------------------------------------------------------
/marker/converters/table.py:
--------------------------------------------------------------------------------
1 | from functools import cache
2 | from typing import Tuple, List
3 |
4 | from marker.builders.document import DocumentBuilder
5 | from marker.builders.line import LineBuilder
6 | from marker.builders.ocr import OcrBuilder
7 | from marker.converters.pdf import PdfConverter
8 | from marker.processors import BaseProcessor
9 | from marker.processors.llm.llm_complex import LLMComplexRegionProcessor
10 | from marker.processors.llm.llm_form import LLMFormProcessor
11 | from marker.processors.llm.llm_table import LLMTableProcessor
12 | from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor
13 | from marker.processors.table import TableProcessor
14 | from marker.providers.registry import provider_from_filepath
15 | from marker.schema import BlockTypes
16 |
17 |
18 | class TableConverter(PdfConverter):
19 | default_processors: Tuple[BaseProcessor, ...] = (
20 | TableProcessor,
21 | LLMTableProcessor,
22 | LLMTableMergeProcessor,
23 | LLMFormProcessor,
24 | LLMComplexRegionProcessor,
25 | )
26 | converter_block_types: List[BlockTypes] = (BlockTypes.Table, BlockTypes.Form, BlockTypes.TableOfContents)
27 |
28 | def build_document(self, filepath: str):
29 | provider_cls = provider_from_filepath(filepath)
30 | layout_builder = self.resolve_dependencies(self.layout_builder_class)
31 | line_builder = self.resolve_dependencies(LineBuilder)
32 | ocr_builder = self.resolve_dependencies(OcrBuilder)
33 | document_builder = DocumentBuilder(self.config)
34 | document_builder.disable_ocr = True
35 |
36 | provider = provider_cls(filepath, self.config)
37 | document = document_builder(provider, layout_builder, line_builder, ocr_builder)
38 |
39 | for page in document.pages:
40 | page.structure = [p for p in page.structure if p.block_type in self.converter_block_types]
41 |
42 | for processor in self.processor_list:
43 | processor(document)
44 |
45 | return document
46 |
47 | def __call__(self, filepath: str):
48 | document = self.build_document(filepath)
49 | renderer = self.resolve_dependencies(self.renderer)
50 | return renderer(document)
--------------------------------------------------------------------------------
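
Usage of `TableConverter` mirrors the other converters; a small sketch, assuming the standard `artifact_dict` wiring and a hypothetical input file:

```python
from marker.converters.table import TableConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

converter = TableConverter(artifact_dict=create_model_dict())
rendered = converter("document.pdf")     # hypothetical input file
text, ext, images = text_from_rendered(rendered)  # only table/form blocks survive the filter
```
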
/marker/extractors/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Annotated, Sequence, Optional
2 |
3 | from pydantic import BaseModel
4 |
5 | from marker.schema import BlockTypes
6 | from marker.schema.document import Document
7 | from marker.schema.groups import PageGroup
8 | from PIL import Image
9 |
10 | from marker.services import BaseService
11 | from marker.util import assign_config
12 |
13 |
14 | class ExtractionResult(BaseModel):
15 | extracted_data: dict | list
16 | value_confidence: int
17 | existence_confidence: int
18 |
19 |
20 | class BaseExtractor:
21 | """
22 | An extractor that uses a provided service to extract structured data from documents.
23 | """
24 |
25 | max_concurrency: Annotated[
26 | int,
27 | "The maximum number of concurrent requests to make to the Gemini model.",
28 | ] = 3
29 | disable_tqdm: Annotated[
30 | bool,
31 | "Whether to disable the tqdm progress bar.",
32 | ] = False
33 |
34 | def __init__(self, llm_service: BaseService, config=None):
35 | self.llm_service = llm_service
36 | assign_config(self, config)
37 |
38 | def extract_image(
39 | self,
40 | document: Document,
41 | page: PageGroup,
42 | remove_blocks: Sequence[BlockTypes] | None = None,
43 | highres: bool = False, # Default False to save tokens
44 | ) -> Image.Image:
45 | return page.get_image(
46 | document,
47 | highres=highres,
48 | remove_blocks=remove_blocks,
49 | )
50 |
51 | def __call__(
52 | self, document: Document, *args, **kwargs
53 | ) -> Optional[ExtractionResult]:
54 | raise NotImplementedError
55 |
--------------------------------------------------------------------------------
/marker/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import warnings
3 |
4 | from marker.settings import settings
5 |
6 |
7 | def configure_logging():
8 | # Setup marker logger
9 | logger = get_logger()
10 |
11 | if not logger.handlers:
12 | handler = logging.StreamHandler()
13 | formatter = logging.Formatter(
14 | "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
15 | )
16 | handler.setFormatter(formatter)
17 | logger.addHandler(handler)
18 |
19 | logger.setLevel(settings.LOGLEVEL)
20 |
21 | # Ignore future warnings
22 | warnings.simplefilter(action="ignore", category=FutureWarning)
23 |
24 | # Set component loglevels
25 | logging.getLogger("PIL").setLevel(logging.ERROR)
26 | logging.getLogger("fontTools.subset").setLevel(logging.ERROR)
27 | logging.getLogger("fontTools.ttLib.ttFont").setLevel(logging.ERROR)
28 | logging.getLogger("weasyprint").setLevel(logging.CRITICAL)
29 |
30 |
31 | def get_logger():
32 | return logging.getLogger("marker")
33 |
--------------------------------------------------------------------------------
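
Intended usage of the logger module above is two calls: configure once at startup, then fetch the shared logger anywhere:

```python
from marker.logger import configure_logging, get_logger

configure_logging()               # attach a stream handler, set LOGLEVEL from settings
logger = get_logger()             # always the shared "marker" logger
logger.info("conversion started")
```
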
/marker/models.py:
--------------------------------------------------------------------------------
1 | import os
2 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for an op, which is not supported on MPS
3 |
4 | from surya.detection import DetectionPredictor
5 | from surya.layout import LayoutPredictor
6 | from surya.ocr_error import OCRErrorPredictor
7 | from surya.recognition import RecognitionPredictor
8 | from surya.table_rec import TableRecPredictor
9 |
10 | def create_model_dict(device=None, dtype=None) -> dict:
11 | return {
12 | "layout_model": LayoutPredictor(device=device, dtype=dtype),
13 | "recognition_model": RecognitionPredictor(device=device, dtype=dtype),
14 | "table_rec_model": TableRecPredictor(device=device, dtype=dtype),
15 | "detection_model": DetectionPredictor(device=device, dtype=dtype),
16 | "ocr_error_model": OCRErrorPredictor(device=device, dtype=dtype)
17 | }
--------------------------------------------------------------------------------
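
The predictors in `create_model_dict` above are loaded once and shared across converters; a short sketch of building the dict on an explicit device (the `torch` import is an assumption about the environment, since surya runs on PyTorch):

```python
import torch  # assumption: available alongside surya
from marker.models import create_model_dict

device = "cuda" if torch.cuda.is_available() else "cpu"
models = create_model_dict(device=device)
print(sorted(models))  # detection_model, layout_model, ocr_error_model, ...
```
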
/marker/output.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 |
4 | from bs4 import BeautifulSoup, Tag
5 | from pydantic import BaseModel
6 | from PIL import Image
7 |
8 | from marker.renderers.extraction import ExtractionOutput
9 | from marker.renderers.html import HTMLOutput
10 | from marker.renderers.json import JSONOutput, JSONBlockOutput
11 | from marker.renderers.markdown import MarkdownOutput
12 | from marker.renderers.ocr_json import OCRJSONOutput
13 | from marker.schema.blocks import BlockOutput
14 | from marker.settings import settings
15 |
16 |
17 | def unwrap_outer_tag(html: str):
18 | soup = BeautifulSoup(html, "html.parser")
19 | contents = list(soup.contents)
20 | if len(contents) == 1 and isinstance(contents[0], Tag) and contents[0].name == "p":
21 | # Unwrap the p tag
22 | soup.p.unwrap()
23 |
24 | return str(soup)
25 |
26 |
27 | def json_to_html(block: JSONBlockOutput | BlockOutput):
28 | # Utility function to take in json block output and give html for the block.
29 | if not getattr(block, "children", None):
30 | return block.html
31 | else:
32 | child_html = [json_to_html(child) for child in block.children]
33 | child_ids = [child.id for child in block.children]
34 |
35 | soup = BeautifulSoup(block.html, "html.parser")
36 | content_refs = soup.find_all("content-ref")
37 | for ref in content_refs:
38 | src_id = ref.attrs["src"]
39 | if src_id in child_ids:
40 | child_soup = BeautifulSoup(
41 | child_html[child_ids.index(src_id)], "html.parser"
42 | )
43 | ref.replace_with(child_soup)
44 | return str(soup)
45 |
46 |
47 | def output_exists(output_dir: str, fname_base: str):
48 | exts = ["md", "html", "json"]
49 | for ext in exts:
50 | if os.path.exists(os.path.join(output_dir, f"{fname_base}.{ext}")):
51 | return True
52 | return False
53 |
54 |
55 | def text_from_rendered(rendered: BaseModel):
56 | if isinstance(rendered, MarkdownOutput):
57 | return rendered.markdown, "md", rendered.images
58 | elif isinstance(rendered, HTMLOutput):
59 | return rendered.html, "html", rendered.images
60 | elif isinstance(rendered, JSONOutput):
61 | return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {}
62 | elif isinstance(rendered, OCRJSONOutput):
63 | return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {}
64 | elif isinstance(rendered, ExtractionOutput):
65 | return rendered.document_json, "json", {}
66 | else:
67 | raise ValueError("Invalid output type")
68 |
69 |
70 | def convert_if_not_rgb(image: Image.Image) -> Image.Image:
71 | if image.mode != "RGB":
72 | image = image.convert("RGB")
73 | return image
74 |
75 |
76 | def save_output(rendered: BaseModel, output_dir: str, fname_base: str):
77 | text, ext, images = text_from_rendered(rendered)
78 | text = text.encode(settings.OUTPUT_ENCODING, errors="replace").decode(
79 | settings.OUTPUT_ENCODING
80 | )
81 |
82 | with open(
83 | os.path.join(output_dir, f"{fname_base}.{ext}"),
84 | "w+",
85 | encoding=settings.OUTPUT_ENCODING,
86 | ) as f:
87 | f.write(text)
88 | with open(
89 | os.path.join(output_dir, f"{fname_base}_meta.json"),
90 | "w+",
91 | encoding=settings.OUTPUT_ENCODING,
92 | ) as f:
93 | f.write(json.dumps(rendered.metadata, indent=2))
94 |
95 | for img_name, img in images.items():
96 | img = convert_if_not_rgb(img) # RGBA images can't save as JPG
97 | img.save(os.path.join(output_dir, img_name), settings.OUTPUT_IMAGE_FORMAT)
98 |
--------------------------------------------------------------------------------
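
Putting the output helpers above together, a small sketch that skips work when output already exists and otherwise persists a rendered result (`rendered` is assumed to come from a converter call, as in the TableConverter sketch earlier; the directory and base name are hypothetical):

```python
import os
from marker.output import output_exists, save_output

def persist(rendered, output_dir="out", fname_base="mydoc"):
    # Skip work if mydoc.md/.html/.json already exists; otherwise write the
    # text, the _meta.json sidecar, and any extracted images.
    os.makedirs(output_dir, exist_ok=True)
    if not output_exists(output_dir, fname_base):
        save_output(rendered, output_dir, fname_base)
```
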
/marker/processors/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Tuple
2 |
3 | from pydantic import BaseModel
4 |
5 | from marker.schema import BlockTypes
6 | from marker.schema.document import Document
7 | from marker.util import assign_config
8 |
9 |
10 | class BaseProcessor:
11 | block_types: Tuple[BlockTypes] | None = None # What block types this processor is responsible for
12 |
13 | def __init__(self, config: Optional[BaseModel | dict] = None):
14 | assign_config(self, config)
15 |
16 | def __call__(self, document: Document, *args, **kwargs):
17 | raise NotImplementedError
18 |
--------------------------------------------------------------------------------
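
A minimal custom processor sketch against the `BaseProcessor` interface above; `CaptionCollector` is hypothetical and assumes `BlockTypes.Caption` exists, while the `page.contained_blocks`/`raw_text` pattern matches the processors that follow:

```python
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document

class CaptionCollector(BaseProcessor):  # hypothetical processor
    block_types = (BlockTypes.Caption,)  # assumption: Caption is a BlockTypes member

    def __call__(self, document: Document, *args, **kwargs):
        self.captions = [
            block.raw_text(document).strip()
            for page in document.pages
            for block in page.contained_blocks(document, self.block_types)
        ]
```
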
/marker/processors/blockquote.py:
--------------------------------------------------------------------------------
1 | from typing import Annotated, Tuple
2 |
3 | from marker.processors import BaseProcessor
4 | from marker.schema import BlockTypes
5 | from marker.schema.document import Document
6 |
7 |
8 | class BlockquoteProcessor(BaseProcessor):
9 | """
10 | A processor for tagging blockquotes.
11 | """
12 | block_types: Annotated[
13 | Tuple[BlockTypes],
14 | "The block types to process.",
15 | ] = (BlockTypes.Text, BlockTypes.TextInlineMath)
16 | min_x_indent: Annotated[
17 | float,
18 | "The minimum horizontal indentation required to consider a block as part of a blockquote.",
19 | "Expressed as a percentage of the block width.",
20 | ] = 0.1
21 | x_start_tolerance: Annotated[
22 | float,
23 | "The maximum allowable difference between the starting x-coordinates of consecutive blocks to consider them aligned.",
24 | "Expressed as a percentage of the block width.",
25 | ] = 0.01
26 | x_end_tolerance: Annotated[
27 | float,
28 | "The maximum allowable difference between the ending x-coordinates of consecutive blocks to consider them aligned.",
29 | "Expressed as a percentage of the block width.",
30 | ] = 0.01
31 |
32 | def __init__(self, config):
33 | super().__init__(config)
34 |
35 | def __call__(self, document: Document):
36 | for page in document.pages:
37 | for block in page.contained_blocks(document, self.block_types):
38 | if block.structure is None:
39 | continue
40 |
41 |                 if len(block.structure) < 2:
42 | continue
43 |
44 | next_block = page.get_next_block(block)
45 | if next_block is None:
46 | continue
47 | if next_block.block_type not in self.block_types:
48 | continue
49 | if next_block.structure is None:
50 | continue
51 | if next_block.ignore_for_output:
52 | continue
53 |
54 | matching_x_end = abs(next_block.polygon.x_end - block.polygon.x_end) < self.x_end_tolerance * block.polygon.width
55 | matching_x_start = abs(next_block.polygon.x_start - block.polygon.x_start) < self.x_start_tolerance * block.polygon.width
56 | x_indent = next_block.polygon.x_start > block.polygon.x_start + (self.min_x_indent * block.polygon.width)
57 | y_indent = next_block.polygon.y_start > block.polygon.y_end
58 |
59 | if block.blockquote:
60 | next_block.blockquote = (matching_x_end and matching_x_start) or (x_indent and y_indent)
61 | next_block.blockquote_level = block.blockquote_level
62 | if (x_indent and y_indent):
63 | next_block.blockquote_level += 1
64 | elif len(next_block.structure) >= 2 and (x_indent and y_indent):
65 | next_block.blockquote = True
66 | next_block.blockquote_level = 1
--------------------------------------------------------------------------------
/marker/processors/code.py:
--------------------------------------------------------------------------------
1 | from marker.processors import BaseProcessor
2 | from marker.schema import BlockTypes
3 | from marker.schema.blocks import Code
4 | from marker.schema.document import Document
5 |
6 |
7 | class CodeProcessor(BaseProcessor):
8 | """
9 | A processor for formatting code blocks.
10 | """
11 | block_types = (BlockTypes.Code, )
12 |
13 | def __call__(self, document: Document):
14 | for page in document.pages:
15 | for block in page.contained_blocks(document, self.block_types):
16 | self.format_block(document, block)
17 |
18 |
19 | def format_block(self, document: Document, block: Code):
20 |         min_left = 9999  # will contain the x-coordinate of column 0
21 | total_width = 0
22 | total_chars = 0
23 |
24 | contained_lines = block.contained_blocks(document, (BlockTypes.Line,))
25 | for line in contained_lines:
26 | min_left = min(line.polygon.bbox[0], min_left)
27 | total_width += line.polygon.width
28 | total_chars += len(line.raw_text(document))
29 |
30 | avg_char_width = total_width / max(total_chars, 1)
31 | code_text = ""
32 | is_new_line = False
33 | for line in contained_lines:
34 | text = line.raw_text(document)
35 | if avg_char_width == 0:
36 | prefix = ""
37 | else:
38 | total_spaces = int((line.polygon.bbox[0] - min_left) / avg_char_width)
39 | prefix = " " * max(0, total_spaces)
40 |
41 | if is_new_line:
42 | text = prefix + text
43 |
44 | code_text += text
45 | is_new_line = text.endswith("\n")
46 |
47 | block.code = code_text.rstrip()
48 |
--------------------------------------------------------------------------------
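
The indentation reconstruction in `format_block` above is simple arithmetic: average character width is total line width over total characters, and each line is prefixed with `(x_start - min_left) / avg_char_width` spaces. A worked sketch with made-up geometry:

```python
# Made-up geometry illustrating CodeProcessor.format_block's indent math.
min_left = 100.0        # x-coordinate of column 0 across the code block
avg_char_width = 5.0    # total width of all lines / total characters
line_x0 = 120.0         # this line starts 20 units right of column 0

spaces = int((line_x0 - min_left) / avg_char_width)
print(repr(" " * spaces + "return x"))  # '    return x' -> four leading spaces
```
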
/marker/processors/document_toc.py:
--------------------------------------------------------------------------------
1 | from marker.processors import BaseProcessor
2 | from marker.schema import BlockTypes
3 | from marker.schema.document import Document
4 |
5 |
6 | class DocumentTOCProcessor(BaseProcessor):
7 | """
8 | A processor for generating a table of contents for the document.
9 | """
10 | block_types = (BlockTypes.SectionHeader, )
11 |
12 | def __call__(self, document: Document):
13 | toc = []
14 | for page in document.pages:
15 | for block in page.contained_blocks(document, self.block_types):
16 | toc.append({
17 | "title": block.raw_text(document).strip(),
18 | "heading_level": block.heading_level,
19 | "page_id": page.page_id,
20 | "polygon": block.polygon.polygon
21 | })
22 | document.table_of_contents = toc
23 |
--------------------------------------------------------------------------------
/marker/processors/footnote.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from marker.processors import BaseProcessor
4 | from marker.schema import BlockTypes
5 | from marker.schema.document import Document
6 | from marker.schema.groups import PageGroup
7 |
8 |
9 | class FootnoteProcessor(BaseProcessor):
10 | """
11 |     A processor for pushing footnotes to the bottom of the page and marking leading footnote references as superscripts.
12 | """
13 | block_types = (BlockTypes.Footnote,)
14 |
15 | def __call__(self, document: Document):
16 | for page in document.pages:
17 | self.push_footnotes_to_bottom(page, document)
18 | self.assign_superscripts(page, document)
19 |
20 | def push_footnotes_to_bottom(self, page: PageGroup, document: Document):
21 | footnote_blocks = page.contained_blocks(document, self.block_types)
22 |
23 | # Push footnotes to the bottom
24 | for block in footnote_blocks:
25 | # Check if it is top-level
26 | if block.id in page.structure:
27 | # Move to bottom if it is
28 | page.structure.remove(block.id)
29 | page.add_structure(block)
30 |
31 | def assign_superscripts(self, page: PageGroup, document: Document):
32 | footnote_blocks = page.contained_blocks(document, self.block_types)
33 |
34 | for block in footnote_blocks:
35 | for span in block.contained_blocks(document, (BlockTypes.Span,)):
36 | if re.match(r"^[0-9\W]+", span.text):
37 | span.has_superscript = True
38 | break
39 |
--------------------------------------------------------------------------------
/marker/processors/llm/llm_image_description.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 | from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor, BlockData
4 |
5 | from marker.schema import BlockTypes
6 | from marker.schema.document import Document
7 |
8 | from typing import Annotated, List
9 |
10 |
11 | class LLMImageDescriptionProcessor(BaseLLMSimpleBlockProcessor):
12 | block_types = (BlockTypes.Picture, BlockTypes.Figure,)
13 | extract_images: Annotated[
14 | bool,
15 | "Extract images from the document."
16 | ] = True
17 | image_description_prompt: Annotated[
18 | str,
19 | "The prompt to use for generating image descriptions.",
20 | "Default is a string containing the Gemini prompt."
21 | ] = """You are a document analysis expert who specializes in creating text descriptions for images.
22 | You will receive an image of a picture or figure. Your job will be to create a short description of the image.
23 | **Instructions:**
24 | 1. Carefully examine the provided image.
25 | 2. Analyze any text that was extracted from within the image.
26 | 3. Output a faithful description of the image. Make sure there is enough specific detail to accurately reconstruct the image. If the image is a figure or contains numeric data, include the numeric data in the output.
27 | **Example:**
28 | Input:
29 | ```text
30 | "Fruit Preference Survey"
31 | 20, 15, 10
32 | Apples, Bananas, Oranges
33 | ```
34 | Output:
35 | In this figure, a bar chart titled "Fruit Preference Survey" is showing the number of people who prefer different types of fruits. The x-axis shows the types of fruits, and the y-axis shows the number of people. The bar chart shows that most people prefer apples, followed by bananas and oranges. 20 people prefer apples, 15 people prefer bananas, and 10 people prefer oranges.
36 | **Input:**
37 | ```text
38 | {raw_text}
39 | ```
40 | """
41 |
42 | def inference_blocks(self, document: Document) -> List[BlockData]:
43 | blocks = super().inference_blocks(document)
44 |         if self.extract_images:  # images are kept in the output, so no description is generated
45 | return []
46 | return blocks
47 |
48 | def block_prompts(self, document: Document) -> List[PromptData]:
49 | prompt_data = []
50 | for block_data in self.inference_blocks(document):
51 | block = block_data["block"]
52 | prompt = self.image_description_prompt.replace("{raw_text}", block.raw_text(document))
53 | image = self.extract_image(document, block)
54 |
55 | prompt_data.append({
56 | "prompt": prompt,
57 | "image": image,
58 | "block": block,
59 | "schema": ImageSchema,
60 | "page": block_data["page"]
61 | })
62 |
63 | return prompt_data
64 |
65 | def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document):
66 | block = prompt_data["block"]
67 |
68 | if not response or "image_description" not in response:
69 | block.update_metadata(llm_error_count=1)
70 | return
71 |
72 | image_description = response["image_description"]
73 | if len(image_description) < 10:
74 | block.update_metadata(llm_error_count=1)
75 | return
76 |
77 | block.description = image_description
78 |
79 | class ImageSchema(BaseModel):
80 | image_description: str
81 |
--------------------------------------------------------------------------------
/marker/processors/llm/llm_meta.py:
--------------------------------------------------------------------------------
1 | from concurrent.futures import ThreadPoolExecutor
2 | from typing import List, Dict, Any
3 |
4 | from marker.logger import get_logger
5 | from tqdm import tqdm
6 |
7 | from marker.processors.llm import BaseLLMSimpleBlockProcessor, BaseLLMProcessor
8 | from marker.schema.document import Document
9 | from marker.services import BaseService
10 |
11 | logger = get_logger()
12 |
13 |
14 | class LLMSimpleBlockMetaProcessor(BaseLLMProcessor):
15 | """
16 | A wrapper for simple LLM processors, so they can all run in parallel.
17 | """
18 |
19 | def __init__(
20 | self,
21 | processor_lst: List[BaseLLMSimpleBlockProcessor],
22 | llm_service: BaseService,
23 | config=None,
24 | ):
25 | super().__init__(llm_service, config)
26 | self.processors = processor_lst
27 |
28 | def __call__(self, document: Document):
29 | if not self.use_llm or self.llm_service is None:
30 | return
31 |
32 | total = sum(
33 | [len(processor.inference_blocks(document)) for processor in self.processors]
34 | )
35 | pbar = tqdm(
36 | desc="LLM processors running", disable=self.disable_tqdm, total=total
37 | )
38 |
39 | all_prompts = [
40 | processor.block_prompts(document) for processor in self.processors
41 | ]
42 | pending = []
43 | futures_map = {}
44 | with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
45 | for i, prompt_lst in enumerate(all_prompts):
46 | for prompt in prompt_lst:
47 | future = executor.submit(self.get_response, prompt)
48 | pending.append(future)
49 | futures_map[future] = {"processor_idx": i, "prompt_data": prompt}
50 |
51 | for future in pending:
52 | try:
53 | result = future.result()
54 | future_data = futures_map.pop(future)
55 | processor: BaseLLMSimpleBlockProcessor = self.processors[
56 | future_data["processor_idx"]
57 | ]
58 | # finalize the result
59 | processor(result, future_data["prompt_data"], document)
60 | except Exception as e:
61 | logger.warning(f"Error processing LLM response: {e}")
62 |
63 | pbar.update(1)
64 |
65 | pbar.close()
66 |
67 | def get_response(self, prompt_data: Dict[str, Any]):
68 | return self.llm_service(
69 | prompt_data["prompt"],
70 | prompt_data["image"],
71 | prompt_data["block"],
72 | prompt_data["schema"],
73 | )
74 |
--------------------------------------------------------------------------------
/marker/processors/order.py:
--------------------------------------------------------------------------------
1 | from statistics import mean
2 | from collections import defaultdict
3 |
4 | from marker.processors import BaseProcessor
5 | from marker.schema import BlockTypes
6 | from marker.schema.document import Document
7 |
8 |
9 | class OrderProcessor(BaseProcessor):
10 | """
11 | A processor for sorting the blocks in order if needed. This can help when the layout image was sliced.
12 | """
13 | block_types = tuple()
14 |
15 | def __call__(self, document: Document):
16 | for page in document.pages:
17 | # Skip OCRed pages
18 | if page.text_extraction_method != "pdftext":
19 | continue
20 |
21 | # Skip pages without layout slicing
22 | if not page.layout_sliced:
23 | continue
24 |
25 | block_idxs = defaultdict(int)
26 | for block_id in page.structure:
27 | block = document.get_block(block_id)
28 | spans = block.contained_blocks(document, (BlockTypes.Span, ))
29 | if len(spans) == 0:
30 | continue
31 |
32 | # Avg span position in original PDF
33 | block_idxs[block_id] = (spans[0].minimum_position + spans[-1].maximum_position) / 2
34 |
35 | for block_id in page.structure:
36 | # Already assigned block id via span position
37 | if block_idxs[block_id] > 0:
38 | continue
39 |
40 | block = document.get_block(block_id)
41 | prev_block = document.get_prev_block(block)
42 | next_block = document.get_next_block(block)
43 |
44 | block_idx_add = 0
45 | if prev_block:
46 | block_idx_add = 1
47 |
48 | while prev_block and prev_block.id not in block_idxs:
49 | prev_block = document.get_prev_block(prev_block)
50 | block_idx_add += 1
51 |
52 | if not prev_block:
53 | block_idx_add = -1
54 | while next_block and next_block.id not in block_idxs:
55 | next_block = document.get_next_block(next_block)
56 | block_idx_add -= 1
57 |
58 | if not next_block and not prev_block:
59 | pass
60 | elif prev_block:
61 | block_idxs[block_id] = block_idxs[prev_block.id] + block_idx_add
62 | else:
63 | block_idxs[block_id] = block_idxs[next_block.id] + block_idx_add
64 |
65 | page.structure = sorted(page.structure, key=lambda x: block_idxs[x])
66 |
67 |
--------------------------------------------------------------------------------
/marker/processors/page_header.py:
--------------------------------------------------------------------------------
1 | from marker.processors import BaseProcessor
2 | from marker.schema import BlockTypes
3 | from marker.schema.document import Document
4 | from marker.schema.groups.page import PageGroup
5 |
6 |
7 | class PageHeaderProcessor(BaseProcessor):
8 | """
9 | A processor for moving PageHeaders to the top
10 | """
11 | block_types = (BlockTypes.PageHeader,)
12 |
13 | def __call__(self, document: Document):
14 | for page in document.pages:
15 | self.move_page_header_to_top(page, document)
16 |
17 | def move_page_header_to_top(self, page: PageGroup, document: Document):
18 | page_header_blocks = page.contained_blocks(document, self.block_types)
19 | page_header_block_ids = [block.id for block in page_header_blocks]
20 | for block_id in page_header_block_ids:
21 | page.structure.remove(block_id)
22 | page.structure[:0] = page_header_block_ids
23 |
24 |
--------------------------------------------------------------------------------
/marker/processors/reference.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from marker.processors import BaseProcessor
4 | from marker.schema import BlockTypes
5 | from marker.schema.blocks import Reference
6 | from marker.schema.document import Document
7 | from marker.schema.groups.list import ListGroup
8 | from marker.schema.groups.table import TableGroup
9 | from marker.schema.registry import get_block_class
10 | from marker.schema.groups.figure import FigureGroup
11 |
12 |
13 | class ReferenceProcessor(BaseProcessor):
14 | """
15 | A processor for adding references to the document.
16 | """
17 |
18 | def __init__(self, config):
19 | super().__init__(config)
20 |
21 | def __call__(self, document: Document):
22 | ReferenceClass: Reference = get_block_class(BlockTypes.Reference)
23 |
24 | for page in document.pages:
25 | refs = page.refs
26 | ref_starts = np.array([ref.coord for ref in refs])
27 |
28 | blocks = []
29 | for block_id in page.structure:
30 | block = page.get_block(block_id)
31 | if isinstance(block, (ListGroup, FigureGroup, TableGroup)):
32 | blocks.extend([page.get_block(b) for b in block.structure])
33 | else:
34 | blocks.append(block)
35 | blocks = [b for b in blocks if not b.ignore_for_output]
36 |
37 | block_starts = np.array([block.polygon.bbox[:2] for block in blocks])
38 |
39 | if not (len(refs) and len(block_starts)):
40 | continue
41 |
42 | distances = np.linalg.norm(block_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :], axis=2)
43 | for ref_idx in range(len(ref_starts)):
44 | block_idx = np.argmin(distances[:, ref_idx])
45 | block = blocks[block_idx]
46 |
47 | ref_block = page.add_full_block(ReferenceClass(
48 | ref=refs[ref_idx].ref,
49 | polygon=block.polygon,
50 | page_id=page.page_id
51 | ))
52 | if block.structure is None:
53 | block.structure = []
54 | block.structure.insert(0, ref_block.id)
55 |
--------------------------------------------------------------------------------
/marker/processors/util.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from bs4 import BeautifulSoup
4 |
5 | from marker.schema import BlockTypes
6 | from marker.schema.groups import PageGroup
7 | from marker.schema.registry import get_block_class
8 | from marker.schema.text import Line
9 |
10 |
11 | def escape_latex_commands(text: str):
12 | text = (text
13 | .replace('\n', '\\n')
14 | .replace('\t', '\\t')
15 | .replace('\r', '\\r'))
16 | return text
17 |
18 |
19 | def add_math_spans_to_line(corrected_text: str, text_line: Line, page: PageGroup):
20 | SpanClass = get_block_class(BlockTypes.Span)
21 | corrected_spans = text_to_spans(corrected_text)
22 |
23 | for span_idx, span in enumerate(corrected_spans):
24 | if span_idx == len(corrected_spans) - 1:
25 | span['content'] += "\n"
26 |
27 | span_block = page.add_full_block(
28 | SpanClass(
29 | polygon=text_line.polygon,
30 | text=span['content'],
31 | font='Unknown',
32 | font_weight=0,
33 | font_size=0,
34 | minimum_position=0,
35 | maximum_position=0,
36 | formats=[span['type']],
37 | url=span.get('url'),
38 | page_id=text_line.page_id,
39 | text_extraction_method="gemini",
40 | has_superscript=span["has_superscript"],
41 | has_subscript=span["has_subscript"]
42 | )
43 | )
44 | text_line.structure.append(span_block.id)
45 |
46 |
47 | def text_to_spans(text):
48 | soup = BeautifulSoup(text, 'html.parser')
49 |
50 | tag_types = {
51 | 'b': 'bold',
52 | 'i': 'italic',
53 | 'math': 'math',
54 | 'sub': 'plain',
55 | 'sup': 'plain',
56 | 'span': 'plain'
57 | }
58 | spans = []
59 |
60 | for element in soup.descendants:
61 |         if len(list(element.parents)) != 1:  # only walk top-level nodes
62 | continue
63 |
64 | url = element.attrs.get('href') if hasattr(element, 'attrs') else None
65 |
66 | if element.name in tag_types:
67 | text = element.get_text()
68 | if element.name == "math":
69 | text = escape_latex_commands(text)
70 | spans.append({
71 | 'type': tag_types[element.name],
72 | 'content': text,
73 | 'url': url,
74 | "has_superscript": element.name == "sup",
75 | "has_subscript": element.name == "sub"
76 | })
77 | elif element.string:
78 | spans.append({
79 | 'type': 'plain',
80 | 'content': element.string,
81 | 'url': url,
82 | "has_superscript": False,
83 | "has_subscript": False
84 | })
85 |
86 | return spans
--------------------------------------------------------------------------------
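
A quick sketch of what `text_to_spans` above produces for a mixed line; the expected output follows directly from the tag table (`<b>` maps to bold, `<math>` content gets the LaTeX escaping, bare text becomes plain spans):

```python
from marker.processors.util import text_to_spans

spans = text_to_spans('plain <b>bold</b> and <math>\\frac{1}{2}</math>')
for span in spans:
    print(span["type"], repr(span["content"]))
# expected roughly:
# plain 'plain '
# bold 'bold'
# plain ' and '
# math '\\frac{1}{2}'
```
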
/marker/providers/__init__.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from typing import List, Optional, Dict
3 |
4 | from PIL import Image
5 | from pydantic import BaseModel
6 |
7 | from pdftext.schema import Reference
8 |
9 | from marker.logger import configure_logging
10 | from marker.schema.polygon import PolygonBox
11 | from marker.schema.text import Span
12 | from marker.schema.text.char import Char
13 | from marker.schema.text.line import Line
14 | from marker.settings import settings
15 | from marker.util import assign_config
16 |
17 | configure_logging()
18 |
19 |
20 | class ProviderOutput(BaseModel):
21 | line: Line
22 | spans: List[Span]
23 | chars: Optional[List[List[Char]]] = None
24 |
25 | @property
26 | def raw_text(self):
27 | return "".join(span.text for span in self.spans)
28 |
29 | def __hash__(self):
30 | return hash(tuple(self.line.polygon.bbox))
31 |
32 | def merge(self, other: "ProviderOutput"):
33 | new_output = deepcopy(self)
34 | other_copy = deepcopy(other)
35 |
36 | new_output.spans.extend(other_copy.spans)
37 | if new_output.chars is not None and other_copy.chars is not None:
38 | new_output.chars.extend(other_copy.chars)
39 | elif other_copy.chars is not None:
40 | new_output.chars = other_copy.chars
41 |
42 | new_output.line.polygon = new_output.line.polygon.merge(
43 | [other_copy.line.polygon]
44 | )
45 | return new_output
46 |
47 |
48 | ProviderPageLines = Dict[int, List[ProviderOutput]]
49 |
50 |
51 | class BaseProvider:
52 | def __init__(self, filepath: str, config: Optional[BaseModel | dict] = None):
53 | assign_config(self, config)
54 | self.filepath = filepath
55 |
56 | def __len__(self):
57 | pass
58 |
59 | def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
60 | pass
61 |
62 | def get_page_bbox(self, idx: int) -> PolygonBox | None:
63 | pass
64 |
65 | def get_page_lines(self, idx: int) -> List[Line]:
66 | pass
67 |
68 | def get_page_refs(self, idx: int) -> List[Reference]:
69 | pass
70 |
71 | def __enter__(self):
72 | return self
73 |
74 | @staticmethod
75 | def get_font_css():
76 | from weasyprint import CSS
77 | from weasyprint.text.fonts import FontConfiguration
78 |
79 | font_config = FontConfiguration()
80 | css = CSS(
81 | string=f"""
82 | @font-face {{
83 | font-family: GoNotoCurrent-Regular;
84 | src: url({settings.FONT_PATH});
85 | font-display: swap;
86 | }}
87 | body {{
88 | font-family: {settings.FONT_NAME.split(".")[0]}, sans-serif;
89 | font-variant-ligatures: none;
90 | font-feature-settings: "liga" 0;
91 | text-rendering: optimizeLegibility;
92 | }}
93 | """,
94 | font_config=font_config,
95 | )
96 | return css
97 |
--------------------------------------------------------------------------------
/marker/providers/document.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import os
3 | import re
4 | import tempfile
5 | from io import BytesIO
6 |
7 | from PIL import Image
8 | from marker.logger import get_logger
9 |
10 | from marker.providers.pdf import PdfProvider
11 |
12 | logger = get_logger()
13 |
14 | css = """
15 | @page {
16 | size: A4;
17 | margin: 2cm;
18 | }
19 |
20 | img {
21 | max-width: 100%;
22 | max-height: 25cm;
23 | object-fit: contain;
24 | margin: 12pt auto;
25 | }
26 |
27 | div, p {
28 | max-width: 100%;
29 | word-break: break-word;
30 | font-size: 10pt;
31 | }
32 |
33 | table {
34 | width: 100%;
35 | border-collapse: collapse;
36 | break-inside: auto;
37 | font-size: 10pt;
38 | }
39 |
40 | tr {
41 | break-inside: avoid;
42 | page-break-inside: avoid;
43 | }
44 |
45 | td {
46 | border: 0.75pt solid #000;
47 | padding: 6pt;
48 | }
49 | """
50 |
51 |
52 | class DocumentProvider(PdfProvider):
53 | def __init__(self, filepath: str, config=None):
54 | temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
55 | self.temp_pdf_path = temp_pdf.name
56 | temp_pdf.close()
57 |
58 | # Convert DOCX to PDF
59 | try:
60 | self.convert_docx_to_pdf(filepath)
61 | except Exception as e:
62 | raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}")
63 |
64 | # Initialize the PDF provider with the temp pdf path
65 | super().__init__(self.temp_pdf_path, config)
66 |
67 | def __del__(self):
68 | if os.path.exists(self.temp_pdf_path):
69 | os.remove(self.temp_pdf_path)
70 |
71 | def convert_docx_to_pdf(self, filepath: str):
72 | from weasyprint import CSS, HTML
73 | import mammoth
74 |
75 | with open(filepath, "rb") as docx_file:
76 | # we convert the docx to HTML
77 | result = mammoth.convert_to_html(docx_file)
78 | html = result.value
79 |
80 | # We convert the HTML into a PDF
81 | HTML(string=self._preprocess_base64_images(html)).write_pdf(
82 | self.temp_pdf_path, stylesheets=[CSS(string=css), self.get_font_css()]
83 | )
84 |
85 | @staticmethod
86 | def _preprocess_base64_images(html_content):
87 | pattern = r'data:([^;]+);base64,([^"\'>\s]+)'
88 |
89 | def convert_image(match):
90 | try:
91 | img_data = base64.b64decode(match.group(2))
92 |
93 | with BytesIO(img_data) as bio:
94 | with Image.open(bio) as img:
95 | output = BytesIO()
96 | img.save(output, format=img.format)
97 | new_base64 = base64.b64encode(output.getvalue()).decode()
98 | return f"data:{match.group(1)};base64,{new_base64}"
99 |
100 | except Exception as e:
101 | logger.error(f"Failed to process image: {e}")
102 |                 return ""  # drop broken images, since they break PDF creation downstream
103 |
104 | return re.sub(pattern, convert_image, html_content)
105 |
--------------------------------------------------------------------------------
/marker/providers/epub.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import os
3 | import tempfile
4 |
5 | from bs4 import BeautifulSoup
6 |
7 | from marker.providers.pdf import PdfProvider
8 |
9 | css = '''
10 | @page {
11 | size: A4;
12 | margin: 2cm;
13 | }
14 |
15 | img {
16 | max-width: 100%;
17 | max-height: 25cm;
18 | object-fit: contain;
19 | margin: 12pt auto;
20 | }
21 |
22 | div, p {
23 | max-width: 100%;
24 | word-break: break-word;
25 | font-size: 10pt;
26 | }
27 |
28 | table {
29 | width: 100%;
30 | border-collapse: collapse;
31 | break-inside: auto;
32 | font-size: 10pt;
33 | }
34 |
35 | tr {
36 | break-inside: avoid;
37 | page-break-inside: avoid;
38 | }
39 |
40 | td {
41 | border: 0.75pt solid #000;
42 | padding: 6pt;
43 | }
44 | '''
45 |
46 |
47 | class EpubProvider(PdfProvider):
48 | def __init__(self, filepath: str, config=None):
49 |         temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
50 | self.temp_pdf_path = temp_pdf.name
51 | temp_pdf.close()
52 |
53 | # Convert Epub to PDF
54 | try:
55 | self.convert_epub_to_pdf(filepath)
56 | except Exception as e:
57 | raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}")
58 |
59 | # Initialize the PDF provider with the temp pdf path
60 | super().__init__(self.temp_pdf_path, config)
61 |
62 | def __del__(self):
63 | if os.path.exists(self.temp_pdf_path):
64 | os.remove(self.temp_pdf_path)
65 |
66 |     def convert_epub_to_pdf(self, filepath: str):
67 | from weasyprint import CSS, HTML
68 | from ebooklib import epub
69 | import ebooklib
70 |
71 | ebook = epub.read_epub(filepath)
72 |
73 | styles = []
74 | html_content = ""
75 | img_tags = {}
76 |
77 | for item in ebook.get_items():
78 | if item.get_type() == ebooklib.ITEM_IMAGE:
79 | img_data = base64.b64encode(item.get_content()).decode("utf-8")
80 | img_tags[item.file_name] = f'data:{item.media_type};base64,{img_data}'
81 | elif item.get_type() == ebooklib.ITEM_STYLE:
82 | styles.append(item.get_content().decode('utf-8'))
83 |
84 | for item in ebook.get_items():
85 | if item.get_type() == ebooklib.ITEM_DOCUMENT:
86 | html_content += item.get_content().decode("utf-8")
87 |
88 | soup = BeautifulSoup(html_content, 'html.parser')
89 | for img in soup.find_all('img'):
90 | src = img.get('src')
91 | if src:
92 | normalized_src = src.replace('../', '')
93 | if normalized_src in img_tags:
94 | img['src'] = img_tags[normalized_src]
95 |
96 | for image in soup.find_all('image'):
97 | src = image.get('xlink:href')
98 | if src:
99 | normalized_src = src.replace('../', '')
100 | if normalized_src in img_tags:
101 | image['xlink:href'] = img_tags[normalized_src]
102 |
103 | html_content = str(soup)
104 |         full_style = ''.join([css])  # the epub's own stylesheets (styles) are collected above but intentionally not applied
105 |
106 |         # Convert the assembled HTML to PDF
107 | HTML(string=html_content, base_url=filepath).write_pdf(
108 | self.temp_pdf_path,
109 | stylesheets=[CSS(string=full_style), self.get_font_css()]
110 | )
111 |
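A standalone sketch of the src-rewriting step above (inputs invented): packaged image paths are normalized and swapped for data URIs so WeasyPrint can render them without reading the epub archive.

from bs4 import BeautifulSoup

img_tags = {"images/cover.jpg": "data:image/jpeg;base64,AAAA"}
soup = BeautifulSoup('<img src="../images/cover.jpg"/>', "html.parser")
for img in soup.find_all("img"):
    src = img.get("src")
    if src and src.replace("../", "") in img_tags:
        img["src"] = img_tags[src.replace("../", "")]
print(soup)  # <img src="data:image/jpeg;base64,AAAA"/>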
--------------------------------------------------------------------------------
/marker/providers/html.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 |
4 | from marker.providers.pdf import PdfProvider
5 |
6 |
7 | class HTMLProvider(PdfProvider):
8 | def __init__(self, filepath: str, config=None):
9 | temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
10 | self.temp_pdf_path = temp_pdf.name
11 | temp_pdf.close()
12 |
13 | # Convert HTML to PDF
14 | try:
15 | self.convert_html_to_pdf(filepath)
16 | except Exception as e:
17 | raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}")
18 |
19 | # Initialize the PDF provider with the temp pdf path
20 | super().__init__(self.temp_pdf_path, config)
21 |
22 | def __del__(self):
23 | if os.path.exists(self.temp_pdf_path):
24 | os.remove(self.temp_pdf_path)
25 |
26 | def convert_html_to_pdf(self, filepath: str):
27 | from weasyprint import HTML
28 |
29 | font_css = self.get_font_css()
30 | HTML(filename=filepath, encoding="utf-8").write_pdf(
31 | self.temp_pdf_path, stylesheets=[font_css]
32 | )
33 |
--------------------------------------------------------------------------------
/marker/providers/image.py:
--------------------------------------------------------------------------------
1 | from typing import List, Annotated
2 | from PIL import Image
3 |
4 | from marker.providers import ProviderPageLines, BaseProvider
5 | from marker.schema.polygon import PolygonBox
6 | from marker.schema.text import Line
7 | from pdftext.schema import Reference
8 |
9 |
10 | class ImageProvider(BaseProvider):
11 | page_range: Annotated[
12 | List[int],
13 | "The range of pages to process.",
14 | "Default is None, which will process all pages.",
15 | ] = None
16 |
17 | image_count: int = 1
18 |
19 | def __init__(self, filepath: str, config=None):
20 | super().__init__(filepath, config)
21 |
22 | self.images = [Image.open(filepath)]
23 | self.page_lines: ProviderPageLines = {i: [] for i in range(self.image_count)}
24 |
25 | if self.page_range is None:
26 | self.page_range = range(self.image_count)
27 |
28 | assert max(self.page_range) < self.image_count and min(self.page_range) >= 0, (
29 | f"Invalid page range, values must be between 0 and {len(self.doc) - 1}. Min of provided page range is {min(self.page_range)} and max is {max(self.page_range)}."
30 | )
31 |
32 | self.page_bboxes = {
33 | i: [0, 0, self.images[i].size[0], self.images[i].size[1]]
34 | for i in self.page_range
35 | }
36 |
37 | def __len__(self):
38 | return self.image_count
39 |
40 | def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
41 | return [self.images[i] for i in idxs]
42 |
43 | def get_page_bbox(self, idx: int) -> PolygonBox | None:
44 | bbox = self.page_bboxes[idx]
45 | if bbox:
46 | return PolygonBox.from_bbox(bbox)
47 |
48 | def get_page_lines(self, idx: int) -> List[Line]:
49 | return self.page_lines[idx]
50 |
51 | def get_page_refs(self, idx: int) -> List[Reference]:
52 | return []
53 |
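Usage sketch (filename illustrative): a single image is treated as a one-page document whose page bbox is simply its pixel dimensions.

from marker.providers.image import ImageProvider

provider = ImageProvider("scan.png")
print(len(provider))                   # 1
print(provider.get_page_bbox(0).bbox)  # [0, 0, width, height]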
--------------------------------------------------------------------------------
/marker/providers/registry.py:
--------------------------------------------------------------------------------
1 | import filetype
2 | import filetype.match as file_match
3 | from bs4 import BeautifulSoup
4 | from filetype.types import archive, document, IMAGE
5 |
6 | from marker.providers.document import DocumentProvider
7 | from marker.providers.epub import EpubProvider
8 | from marker.providers.html import HTMLProvider
9 | from marker.providers.image import ImageProvider
10 | from marker.providers.pdf import PdfProvider
11 | from marker.providers.powerpoint import PowerPointProvider
12 | from marker.providers.spreadsheet import SpreadSheetProvider
13 |
14 | DOCTYPE_MATCHERS = {
15 | "image": IMAGE,
16 | "pdf": [
17 | archive.Pdf,
18 | ],
19 | "epub": [
20 | archive.Epub,
21 | ],
22 | "doc": [document.Docx],
23 | "xls": [document.Xlsx],
24 | "ppt": [document.Pptx],
25 | }
26 |
27 |
28 | def load_matchers(doctype: str):
29 | return [cls() for cls in DOCTYPE_MATCHERS[doctype]]
30 |
31 |
32 | def load_extensions(doctype: str):
33 | return [cls.EXTENSION for cls in DOCTYPE_MATCHERS[doctype]]
34 |
35 |
36 | def provider_from_ext(filepath: str):
37 | ext = filepath.rsplit(".", 1)[-1].strip()
38 | if not ext:
39 | return PdfProvider
40 |
41 | if ext in load_extensions("image"):
42 | return ImageProvider
43 | if ext in load_extensions("pdf"):
44 | return PdfProvider
45 | if ext in load_extensions("doc"):
46 | return DocumentProvider
47 | if ext in load_extensions("xls"):
48 | return SpreadSheetProvider
49 | if ext in load_extensions("ppt"):
50 | return PowerPointProvider
51 | if ext in load_extensions("epub"):
52 | return EpubProvider
53 | if ext in ["html"]:
54 | return HTMLProvider
55 |
56 | return PdfProvider
57 |
58 |
59 | def provider_from_filepath(filepath: str):
60 | if filetype.image_match(filepath) is not None:
61 | return ImageProvider
62 | if file_match(filepath, load_matchers("pdf")) is not None:
63 | return PdfProvider
64 | if file_match(filepath, load_matchers("epub")) is not None:
65 | return EpubProvider
66 | if file_match(filepath, load_matchers("doc")) is not None:
67 | return DocumentProvider
68 | if file_match(filepath, load_matchers("xls")) is not None:
69 | return SpreadSheetProvider
70 | if file_match(filepath, load_matchers("ppt")) is not None:
71 | return PowerPointProvider
72 |
73 | try:
74 | with open(filepath, "r", encoding="utf-8") as f:
75 | soup = BeautifulSoup(f.read(), "html.parser")
76 | # Check if there are any HTML tags
77 | if bool(soup.find()):
78 | return HTMLProvider
79 | except Exception:
80 | pass
81 |
82 | # Fallback if we incorrectly detect the file type
83 | return provider_from_ext(filepath)
84 |
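Usage sketch (paths illustrative; the files must exist, since detection reads their bytes): content sniffing decides first, and provider_from_ext is only the fallback.

from marker.providers.registry import provider_from_filepath

for path in ["paper.pdf", "book.epub", "notes.html"]:
    print(path, "->", provider_from_filepath(path).__name__)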
--------------------------------------------------------------------------------
/marker/providers/utils.py:
--------------------------------------------------------------------------------
1 | def alphanum_ratio(text):
2 | text = text.replace(" ", "")
3 | text = text.replace("\n", "")
4 | alphanumeric_count = sum([1 for c in text if c.isalnum()])
5 |
6 | if len(text) == 0:
7 | return 1
8 |
9 | ratio = alphanumeric_count / len(text)
10 | return ratio
11 |
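Worked example: spaces and newlines are dropped before counting, and empty input deliberately scores 1 so blank text never looks like garbage.

from marker.providers.utils import alphanum_ratio

print(alphanum_ratio("ab 12!"))  # 4 alphanumeric of 5 kept chars -> 0.8
print(alphanum_ratio(" \n "))    # 1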
--------------------------------------------------------------------------------
/marker/renderers/extraction.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Dict
3 |
4 | from pydantic import BaseModel
5 |
6 | from marker.extractors import ExtractionResult
7 | from marker.renderers import BaseRenderer
8 |
9 |
10 | @dataclass
11 | class MergeData:
12 | confidence_exists_1: float
13 | confidence_exists_2: float
14 | confidence_value_1: float
15 | confidence_value_2: float
16 |
17 |
18 | def merge_keys(
19 |     json: dict | list, json2: dict | list, merge_data: MergeData, confidence_threshold: int = 3
20 | ):
21 | if isinstance(json, list):
22 | json.extend(json2)
23 |
24 | elif isinstance(json, dict):
25 | for key in json:
26 | if isinstance(json[key], dict):
27 | merge_keys(json[key], json2[key], merge_data)
28 | elif isinstance(json[key], list):
29 | json[key] = json[key] + json2[key]
30 | else:
31 | value_2_correct = (
32 | merge_data.confidence_exists_2 > confidence_threshold
33 | and merge_data.confidence_value_2 > confidence_threshold
34 | )
35 |
36 | if value_2_correct and json2[key]:
37 | json[key] = json2[key]
38 |
39 | if not json[key] and json2[key]:
40 | json[key] = json2[key]
41 |
42 |
43 | class ExtractionOutput(BaseModel):
44 | pages: Dict[int, ExtractionResult]
45 | document_json: dict
46 |
47 |
48 | class ExtractionRenderer(BaseRenderer):
49 | def __call__(self, outputs: Dict[int, ExtractionResult]) -> ExtractionOutput:
50 | pnums = sorted(list(outputs.keys()))
51 | merged_result = outputs[pnums[0]].extracted_data.copy()
52 | confidence_exists = outputs[pnums[0]].existence_confidence
53 | confidence_value = outputs[pnums[0]].value_confidence
54 |
55 | for pnum in pnums[1:]:
56 | merge_data = MergeData(
57 | confidence_exists_1=confidence_exists,
58 | confidence_exists_2=outputs[pnum].existence_confidence,
59 | confidence_value_1=confidence_value,
60 | confidence_value_2=outputs[pnum].value_confidence,
61 | )
62 | merge_keys(merged_result, outputs[pnum].extracted_data, merge_data)
63 |
64 | return ExtractionOutput(pages=outputs, document_json=merged_result)
65 |
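Worked example of merge_keys (values invented): lists concatenate, a later page's scalar wins only when both of its confidences clear the threshold, and empty values are always backfilled.

from marker.renderers.extraction import MergeData, merge_keys

base = {"invoice": {"number": "", "total": "10.00"}, "items": ["a"]}
new = {"invoice": {"number": "INV-7", "total": "99.00"}, "items": ["b"]}
md = MergeData(confidence_exists_1=5, confidence_exists_2=4,
               confidence_value_1=5, confidence_value_2=2)
merge_keys(base, new, md)
print(base)  # number backfilled to 'INV-7'; total kept; items == ['a', 'b']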
--------------------------------------------------------------------------------
/marker/renderers/json.py:
--------------------------------------------------------------------------------
1 | from typing import Annotated, Dict, List, Tuple
2 |
3 | from pydantic import BaseModel
4 |
5 | from marker.renderers import BaseRenderer
6 | from marker.schema import BlockTypes
7 | from marker.schema.blocks import Block, BlockOutput
8 | from marker.schema.document import Document
9 | from marker.schema.registry import get_block_class
10 |
11 |
12 | class JSONBlockOutput(BaseModel):
13 | id: str
14 | block_type: str
15 | html: str
16 | polygon: List[List[float]]
17 | bbox: List[float]
18 | children: List['JSONBlockOutput'] | None = None
19 | section_hierarchy: Dict[int, str] | None = None
20 | images: dict | None = None
21 |
22 |
23 | class JSONOutput(BaseModel):
24 | children: List[JSONBlockOutput]
25 | block_type: str = str(BlockTypes.Document)
26 | metadata: dict
27 |
28 |
29 | def reformat_section_hierarchy(section_hierarchy):
30 | new_section_hierarchy = {}
31 | for key, value in section_hierarchy.items():
32 | new_section_hierarchy[key] = str(value)
33 | return new_section_hierarchy
34 |
35 |
36 | class JSONRenderer(BaseRenderer):
37 | """
38 | A renderer for JSON output.
39 | """
40 | image_blocks: Annotated[
41 | Tuple[BlockTypes],
42 | "The list of block types to consider as images.",
43 | ] = (BlockTypes.Picture, BlockTypes.Figure)
44 | page_blocks: Annotated[
45 | Tuple[BlockTypes],
46 | "The list of block types to consider as pages.",
47 | ] = (BlockTypes.Page,)
48 |
49 | def extract_json(self, document: Document, block_output: BlockOutput):
50 | cls = get_block_class(block_output.id.block_type)
51 | if cls.__base__ == Block:
52 | html, images = self.extract_block_html(document, block_output)
53 | return JSONBlockOutput(
54 | html=html,
55 | polygon=block_output.polygon.polygon,
56 | bbox=block_output.polygon.bbox,
57 | id=str(block_output.id),
58 | block_type=str(block_output.id.block_type),
59 | images=images,
60 | section_hierarchy=reformat_section_hierarchy(block_output.section_hierarchy)
61 | )
62 | else:
63 | children = []
64 | for child in block_output.children:
65 | child_output = self.extract_json(document, child)
66 | children.append(child_output)
67 |
68 | return JSONBlockOutput(
69 | html=block_output.html,
70 | polygon=block_output.polygon.polygon,
71 | bbox=block_output.polygon.bbox,
72 | id=str(block_output.id),
73 | block_type=str(block_output.id.block_type),
74 | children=children,
75 | section_hierarchy=reformat_section_hierarchy(block_output.section_hierarchy)
76 | )
77 |
78 | def __call__(self, document: Document) -> JSONOutput:
79 | document_output = document.render()
80 | json_output = []
81 | for page_output in document_output.children:
82 | json_output.append(self.extract_json(document, page_output))
83 | return JSONOutput(
84 | children=json_output,
85 | metadata=self.generate_document_metadata(document, document_output)
86 | )
87 |
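Sketch for consumers: JSONBlockOutput nests arbitrarily deep, so a small recursive walk covers the whole tree. json_output stands for a JSONRenderer result.

def walk(block, depth=0):
    print("  " * depth + f"{block.block_type} ({block.id})")
    for child in block.children or []:
        walk(child, depth + 1)

# for page in json_output.children:
#     walk(page)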
--------------------------------------------------------------------------------
/marker/schema/__init__.py:
--------------------------------------------------------------------------------
1 | from enum import auto, Enum
2 |
3 |
4 | class BlockTypes(str, Enum):
5 | Line = auto()
6 | Span = auto()
7 | Char = auto()
8 | FigureGroup = auto()
9 | TableGroup = auto()
10 | ListGroup = auto()
11 | PictureGroup = auto()
12 | Page = auto()
13 | Caption = auto()
14 | Code = auto()
15 | Figure = auto()
16 | Footnote = auto()
17 | Form = auto()
18 | Equation = auto()
19 | Handwriting = auto()
20 | TextInlineMath = auto()
21 | ListItem = auto()
22 | PageFooter = auto()
23 | PageHeader = auto()
24 | Picture = auto()
25 | SectionHeader = auto()
26 | Table = auto()
27 | Text = auto()
28 | TableOfContents = auto()
29 | Document = auto()
30 | ComplexRegion = auto()
31 | TableCell = auto()
32 | Reference = auto()
33 |
34 | def __str__(self):
35 | return self.name
36 |
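Quick check of the __str__ override: renderers embed the member name, not the auto() value, when writing block-type attributes.

from marker.schema import BlockTypes

print(str(BlockTypes.SectionHeader))  # 'SectionHeader'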
--------------------------------------------------------------------------------
/marker/schema/blocks/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from marker.schema.blocks.base import Block, BlockId, BlockOutput
4 | from marker.schema.blocks.caption import Caption
5 | from marker.schema.blocks.code import Code
6 | from marker.schema.blocks.figure import Figure
7 | from marker.schema.blocks.footnote import Footnote
8 | from marker.schema.blocks.form import Form
9 | from marker.schema.blocks.equation import Equation
10 | from marker.schema.blocks.handwriting import Handwriting
11 | from marker.schema.blocks.inlinemath import InlineMath
12 | from marker.schema.blocks.listitem import ListItem
13 | from marker.schema.blocks.pagefooter import PageFooter
14 | from marker.schema.blocks.pageheader import PageHeader
15 | from marker.schema.blocks.picture import Picture
16 | from marker.schema.blocks.sectionheader import SectionHeader
17 | from marker.schema.blocks.table import Table
18 | from marker.schema.blocks.text import Text
19 | from marker.schema.blocks.toc import TableOfContents
20 | from marker.schema.blocks.complexregion import ComplexRegion
21 | from marker.schema.blocks.tablecell import TableCell
22 | from marker.schema.blocks.reference import Reference
23 |
--------------------------------------------------------------------------------
/marker/schema/blocks/basetable.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from marker.schema import BlockTypes
4 | from marker.schema.blocks import Block, BlockOutput
5 | from marker.schema.blocks.tablecell import TableCell
6 |
7 |
8 | class BaseTable(Block):
9 | block_type: BlockTypes | None = None
10 | html: str | None = None
11 |
12 | @staticmethod
13 | def format_cells(document, child_blocks, child_cells: List[TableCell] | None = None):
14 | if child_cells is None:
15 | child_cells: List[TableCell] = [document.get_block(c.id) for c in child_blocks if c.id.block_type == BlockTypes.TableCell]
16 |
17 | unique_rows = sorted(list(set([c.row_id for c in child_cells])))
18 | html_repr = ""
19 | for row_id in unique_rows:
20 | row_cells = sorted([c for c in child_cells if c.row_id == row_id], key=lambda x: x.col_id)
21 | html_repr += ""
22 | for cell in row_cells:
23 | html_repr += cell.assemble_html(document, child_blocks, None)
24 | html_repr += " "
25 | html_repr += " "
26 | return html_repr
27 |
28 | def assemble_html(self, document, child_blocks: List[BlockOutput], parent_structure=None):
29 | # Filter out the table cells, so they don't render twice
30 | child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
31 | template = super().assemble_html(document, child_ref_blocks, parent_structure)
32 |
33 | child_block_types = set([c.id.block_type for c in child_blocks])
34 | if self.html:
35 | # LLM processor
36 | return template + self.html
37 | elif len(child_blocks) > 0 and BlockTypes.TableCell in child_block_types:
38 | # Table processor
39 | return template + self.format_cells(document, child_blocks)
40 | else:
41 | # Default text lines and spans
42 |             return f"<p>{template}</p>"
43 |
--------------------------------------------------------------------------------
/marker/schema/blocks/caption.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 |
4 |
5 | class Caption(Block):
6 | block_type: BlockTypes = BlockTypes.Caption
7 | block_description: str = "A text caption that is directly above or below an image or table. Only used for text describing the image or table. "
8 | replace_output_newlines: bool = True
9 | html: str | None = None
10 |
11 | def assemble_html(self, document, child_blocks, parent_structure):
12 | if self.html:
13 | return super().handle_html_output(document, child_blocks, parent_structure)
14 |
15 | return super().assemble_html(document, child_blocks, parent_structure)
16 |
17 |
--------------------------------------------------------------------------------
/marker/schema/blocks/code.py:
--------------------------------------------------------------------------------
1 | import html
2 |
3 | from marker.schema import BlockTypes
4 | from marker.schema.blocks import Block
5 |
6 |
7 | class Code(Block):
8 | block_type: BlockTypes = BlockTypes.Code
9 | code: str | None = None
10 | block_description: str = "A programming code block."
11 |
12 | def assemble_html(self, document, child_blocks, parent_structure):
13 | code = self.code or ""
14 |         return (f"<pre>"
15 |                 f"{html.escape(code)}"
16 |                 f"</pre>")
17 |
--------------------------------------------------------------------------------
/marker/schema/blocks/complexregion.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 |
4 |
5 | class ComplexRegion(Block):
6 | block_type: BlockTypes = BlockTypes.ComplexRegion
7 | html: str | None = None
8 | block_description: str = "A complex region that can consist of multiple different types of blocks mixed with images. This block is chosen when it is difficult to categorize the region as a single block type."
9 |
10 | def assemble_html(self, document, child_blocks, parent_structure):
11 | if self.html:
12 | child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
13 | html = super().assemble_html(document, child_ref_blocks, parent_structure)
14 | return html + self.html
15 | else:
16 | template = super().assemble_html(document, child_blocks, parent_structure)
17 |             return f"<p>{template}</p>"
18 |
--------------------------------------------------------------------------------
/marker/schema/blocks/equation.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 |
4 |
5 | class Equation(Block):
6 | block_type: BlockTypes = BlockTypes.Equation
7 | html: str | None = None
8 | block_description: str = "A block math equation."
9 |
10 | def assemble_html(self, document, child_blocks, parent_structure=None):
11 | if self.html:
12 | child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
13 | html_out = super().assemble_html(document, child_ref_blocks, parent_structure)
14 |             html_out += f"""<p block-type='{self.block_type}'>{self.html}</p>"""
15 | return html_out
16 | else:
17 | template = super().assemble_html(document, child_blocks, parent_structure)
18 |             return f"<p block-type='{self.block_type}'>{template}</p>"
19 |
--------------------------------------------------------------------------------
/marker/schema/blocks/figure.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 |
4 |
5 | class Figure(Block):
6 | block_type: BlockTypes = BlockTypes.Figure
7 | description: str | None = None
8 | block_description: str = "A chart or other image that contains data."
9 |
10 | def assemble_html(self, document, child_blocks, parent_structure):
11 | child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
12 | html = super().assemble_html(document, child_ref_blocks, parent_structure)
13 | if self.description:
14 |             html += f"<p>Image {self.id} description: {self.description}</p>"
15 | return html
16 |
--------------------------------------------------------------------------------
/marker/schema/blocks/footnote.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 |
4 |
5 | class Footnote(Block):
6 | block_type: BlockTypes = BlockTypes.Footnote
7 | block_description: str = "A footnote that explains a term or concept in the document."
8 | replace_output_newlines: bool = True
9 | html: str | None = None
10 |
11 | def assemble_html(self, document, child_blocks, parent_structure):
12 | if self.html:
13 | return super().handle_html_output(document, child_blocks, parent_structure)
14 |
15 | return super().assemble_html(document, child_blocks, parent_structure)
16 |
--------------------------------------------------------------------------------
/marker/schema/blocks/form.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from marker.schema import BlockTypes
4 | from marker.schema.blocks.basetable import BaseTable
5 |
6 |
7 | class Form(BaseTable):
8 | block_type: BlockTypes = BlockTypes.Form
9 | block_description: str = "A form, such as a tax form, that contains fields and labels. It most likely doesn't have a table structure."
10 |
--------------------------------------------------------------------------------
/marker/schema/blocks/handwriting.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 |
4 |
5 | class Handwriting(Block):
6 | block_type: BlockTypes = BlockTypes.Handwriting
7 | block_description: str = "A region that contains handwriting."
8 | html: str | None = None
9 | replace_output_newlines: bool = True
10 |
11 | def assemble_html(self, document, child_blocks, parent_structure):
12 | if self.html:
13 | return self.html
14 | else:
15 | return super().assemble_html(document, child_blocks, parent_structure)
16 |
--------------------------------------------------------------------------------
/marker/schema/blocks/inlinemath.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 |
4 |
5 | class InlineMath(Block):
6 | block_type: BlockTypes = BlockTypes.TextInlineMath
7 | has_continuation: bool = False
8 | blockquote: bool = False
9 | blockquote_level: int = 0
10 | block_description: str = "A text block that contains inline math. This is not used for italic text or references - only for text that contains math."
11 | html: str | None = None
12 |
13 | def assemble_html(self, document, child_blocks, parent_structure):
14 | if self.ignore_for_output:
15 | return ""
16 |
17 | if self.html:
18 | return super().handle_html_output(document, child_blocks, parent_structure)
19 |
20 | template = super().assemble_html(document, child_blocks, parent_structure)
21 | template = template.replace("\n", " ")
22 |
23 | el_attr = f" block-type='{self.block_type}'"
24 | if self.has_continuation:
25 | el_attr += " class='has-continuation'"
26 |
27 | if self.blockquote:
28 | # Add indentation for blockquote levels
29 |             blockquote_prefix = "<blockquote>" * self.blockquote_level
30 |             blockquote_suffix = "</blockquote>" * self.blockquote_level
31 |             return f"{blockquote_prefix}<p{el_attr}>{template}</p>{blockquote_suffix}"
32 |         else:
33 |             return f"<p{el_attr}>{template}</p>"
34 |
--------------------------------------------------------------------------------
/marker/schema/blocks/listitem.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from marker.schema import BlockTypes
4 | from marker.schema.blocks import Block
5 |
6 |
7 | def replace_bullets(child_blocks):
8 | # Replace bullet characters with a -
9 | first_block = None
10 | while len(child_blocks) > 0:
11 | first_block = child_blocks[0]
12 | child_blocks = first_block.children
13 |
14 | if first_block is not None and first_block.id.block_type == BlockTypes.Line:
15 | bullet_pattern = r"(^|[\n ]|<[^>]*>)[•●○ഠ ം◦■▪▫–—-]( )"
16 | first_block.html = re.sub(bullet_pattern, r"\1\2", first_block.html)
17 |
18 |
19 | class ListItem(Block):
20 | block_type: BlockTypes = BlockTypes.ListItem
21 | list_indent_level: int = 0
22 | block_description: str = "A list item that is part of a list. This block is used to represent a single item in a list."
23 | html: str | None = None
24 |
25 | def assemble_html(self, document, child_blocks, parent_structure):
26 | template = super().assemble_html(document, child_blocks, parent_structure)
27 | template = template.replace("\n", " ")
28 | # Remove the first bullet character
29 | replace_bullets(child_blocks)
30 |
31 | if self.html:
32 | template = super().handle_html_output(document, child_blocks, parent_structure).strip()
33 |             template = template.replace("<li>", "").replace("</li>", "")
34 |
35 | el_attr = f" block-type='{self.block_type}'"
36 | if self.list_indent_level:
37 |             return f"<li{el_attr} class='list-indent-{self.list_indent_level}'>{template}</li>"
38 |         return f"<li{el_attr}>{template}</li>"
39 |
--------------------------------------------------------------------------------
/marker/schema/blocks/pagefooter.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 |
4 |
5 | class PageFooter(Block):
6 |     block_type: BlockTypes = BlockTypes.PageFooter
7 | block_description: str = "Text that appears at the bottom of a page, like a page number."
8 | replace_output_newlines: bool = True
9 | ignore_for_output: bool = True
10 |
11 |
--------------------------------------------------------------------------------
/marker/schema/blocks/pageheader.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 |
4 |
5 | class PageHeader(Block):
6 | block_type: BlockTypes = BlockTypes.PageHeader
7 | block_description: str = "Text that appears at the top of a page, like a page title."
8 | replace_output_newlines: bool = True
9 | ignore_for_output: bool = True
10 |
11 |
--------------------------------------------------------------------------------
/marker/schema/blocks/picture.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 |
4 |
5 | class Picture(Block):
6 | block_type: BlockTypes = BlockTypes.Picture
7 | description: str | None = None
8 | block_description: str = "An image block that represents a picture."
9 |
10 | def assemble_html(self, document, child_blocks, parent_structure):
11 | child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
12 | html = super().assemble_html(document, child_ref_blocks, parent_structure)
13 |
14 | if self.description:
15 |             return html + f"<p>Image {self.id} description: {self.description}</p>"
16 | return html
17 |
--------------------------------------------------------------------------------
/marker/schema/blocks/reference.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 |
4 |
5 | class Reference(Block):
6 | block_type: BlockTypes = BlockTypes.Reference
7 | ref: str
8 | block_description: str = "A reference to this block from another block."
9 |
10 | def assemble_html(self, document, child_blocks, parent_structure=None):
11 | template = super().assemble_html(document, child_blocks, parent_structure)
12 |         return f"<span id='{self.ref}'>{template}</span>"
13 |
--------------------------------------------------------------------------------
/marker/schema/blocks/sectionheader.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from marker.schema import BlockTypes
4 | from marker.schema.blocks import Block
5 |
6 |
7 | class SectionHeader(Block):
8 | block_type: BlockTypes = BlockTypes.SectionHeader
9 | heading_level: Optional[int] = None
10 | block_description: str = "The header of a section of text or other blocks."
11 | html: str | None = None
12 |
13 | def assemble_html(self, document, child_blocks, parent_structure):
14 | if self.ignore_for_output:
15 | return ""
16 |
17 | if self.html:
18 | return super().handle_html_output(document, child_blocks, parent_structure)
19 |
20 | template = super().assemble_html(document, child_blocks, parent_structure)
21 | template = template.replace("\n", " ")
22 | tag = f"h{self.heading_level}" if self.heading_level else "h2"
23 |         return f"<{tag}>{template}</{tag}>"
24 |
--------------------------------------------------------------------------------
/marker/schema/blocks/table.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks.basetable import BaseTable
3 |
4 |
5 | class Table(BaseTable):
6 | block_type: BlockTypes = BlockTypes.Table
7 | block_description: str = "A table of data, like a results table. It will be in a tabular format."
8 |
--------------------------------------------------------------------------------
/marker/schema/blocks/tablecell.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from marker.schema import BlockTypes
4 | from marker.schema.blocks import Block
5 |
6 |
7 | class TableCell(Block):
8 | block_type: BlockTypes = BlockTypes.TableCell
9 | rowspan: int
10 | colspan: int
11 | row_id: int
12 | col_id: int
13 | is_header: bool
14 | text_lines: List[str] | None = None
15 | block_description: str = "A cell in a table."
16 |
17 | @property
18 | def text(self):
19 |         return "\n".join(self.text_lines or [])
20 |
21 | def assemble_html(self, document, child_blocks, parent_structure=None):
22 | tag_cls = "th" if self.is_header else "td"
23 | tag = f"<{tag_cls}"
24 | if self.rowspan > 1:
25 | tag += f" rowspan={self.rowspan}"
26 | if self.colspan > 1:
27 | tag += f" colspan={self.colspan}"
28 | if self.text_lines is None:
29 | self.text_lines = []
30 | text = " ".join(self.text_lines)
31 |         return f"{tag}>{text}</{tag_cls}>"
32 |
--------------------------------------------------------------------------------
/marker/schema/blocks/text.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 |
4 |
5 | class Text(Block):
6 | block_type: BlockTypes = BlockTypes.Text
7 | has_continuation: bool = False
8 | blockquote: bool = False
9 | blockquote_level: int = 0
10 | html: str | None = None
11 | block_description: str = "A paragraph or line of text."
12 |
13 | def assemble_html(self, document, child_blocks, parent_structure):
14 | if self.ignore_for_output:
15 | return ""
16 |
17 | # This happens when we used an llm processor
18 | if self.html:
19 | return super().handle_html_output(document, child_blocks, parent_structure)
20 |
21 | template = super().assemble_html(document, child_blocks, parent_structure)
22 | template = template.replace("\n", " ")
23 |
24 | el_attr = f" block-type='{self.block_type}'"
25 | if self.has_continuation:
26 | el_attr += " class='has-continuation'"
27 |
28 | if self.blockquote:
29 |             blockquote_prefix = "<blockquote>" * self.blockquote_level
30 |             blockquote_suffix = "</blockquote>" * self.blockquote_level
31 |             return f"{blockquote_prefix}<p{el_attr}>{template}</p>{blockquote_suffix}"
32 |         else:
33 |             return f"<p{el_attr}>{template}</p>"
34 |
--------------------------------------------------------------------------------
/marker/schema/blocks/toc.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks.basetable import BaseTable
3 |
4 |
5 | class TableOfContents(BaseTable):
6 |     block_type: BlockTypes = BlockTypes.TableOfContents
7 | block_description: str = "A table of contents."
8 |
--------------------------------------------------------------------------------
/marker/schema/document.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import List, Sequence
4 |
5 | from pydantic import BaseModel
6 |
7 | from marker.schema import BlockTypes
8 | from marker.schema.blocks import Block, BlockId, BlockOutput
9 | from marker.schema.groups.page import PageGroup
10 |
11 |
12 | class DocumentOutput(BaseModel):
13 | children: List[BlockOutput]
14 | html: str
15 | block_type: BlockTypes = BlockTypes.Document
16 |
17 |
18 | class TocItem(BaseModel):
19 | title: str
20 | heading_level: int
21 | page_id: int
22 | polygon: List[List[float]]
23 |
24 |
25 | class Document(BaseModel):
26 | filepath: str
27 | pages: List[PageGroup]
28 | block_type: BlockTypes = BlockTypes.Document
29 | table_of_contents: List[TocItem] | None = None
30 | debug_data_path: str | None = None # Path that debug data was saved to
31 |
32 | def get_block(self, block_id: BlockId):
33 | page = self.get_page(block_id.page_id)
34 | block = page.get_block(block_id)
35 | if block:
36 | return block
37 | return None
38 |
39 | def get_page(self, page_id):
40 | for page in self.pages:
41 | if page.page_id == page_id:
42 | return page
43 | return None
44 |
45 | def get_next_block(self, block: Block, ignored_block_types: List[BlockTypes] = None):
46 | if ignored_block_types is None:
47 | ignored_block_types = []
48 | next_block = None
49 |
50 | # Try to find the next block in the current page
51 | page = self.get_page(block.page_id)
52 | next_block = page.get_next_block(block, ignored_block_types)
53 | if next_block:
54 | return next_block
55 |
56 | # If no block found, search subsequent pages
57 | for page in self.pages[self.pages.index(page) + 1:]:
58 | next_block = page.get_next_block(None, ignored_block_types)
59 | if next_block:
60 | return next_block
61 | return None
62 |
63 | def get_next_page(self, page: PageGroup):
64 | page_idx = self.pages.index(page)
65 | if page_idx + 1 < len(self.pages):
66 | return self.pages[page_idx + 1]
67 | return None
68 |
69 | def get_prev_block(self, block: Block):
70 | page = self.get_page(block.page_id)
71 | prev_block = page.get_prev_block(block)
72 | if prev_block:
73 | return prev_block
74 | prev_page = self.get_prev_page(page)
75 | if not prev_page:
76 | return None
77 | return prev_page.get_block(prev_page.structure[-1])
78 |
79 | def get_prev_page(self, page: PageGroup):
80 | page_idx = self.pages.index(page)
81 | if page_idx > 0:
82 | return self.pages[page_idx - 1]
83 | return None
84 |
85 | def assemble_html(self, child_blocks: List[Block]):
86 | template = ""
87 | for c in child_blocks:
88 |             template += f"{c.html}"
89 | return template
90 |
91 | def render(self):
92 | child_content = []
93 | section_hierarchy = None
94 | for page in self.pages:
95 | rendered = page.render(self, None, section_hierarchy)
96 | section_hierarchy = rendered.section_hierarchy.copy()
97 | child_content.append(rendered)
98 |
99 | return DocumentOutput(
100 | children=child_content,
101 | html=self.assemble_html(child_content)
102 | )
103 |
104 | def contained_blocks(self, block_types: Sequence[BlockTypes] = None) -> List[Block]:
105 | blocks = []
106 | for page in self.pages:
107 | blocks += page.contained_blocks(self, block_types)
108 | return blocks
109 |
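Navigation sketch: get_next_block already crosses page boundaries, so skipping repeated headers and footers is just a matter of passing ignored_block_types.

from marker.schema import BlockTypes

def next_body_block(document, block):
    return document.get_next_block(
        block, ignored_block_types=[BlockTypes.PageHeader, BlockTypes.PageFooter]
    )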
--------------------------------------------------------------------------------
/marker/schema/groups/__init__.py:
--------------------------------------------------------------------------------
1 | from marker.schema.blocks.base import Block
2 | from marker.schema.groups.figure import FigureGroup
3 | from marker.schema.groups.table import TableGroup
4 | from marker.schema.groups.list import ListGroup
5 | from marker.schema.groups.picture import PictureGroup
6 | from marker.schema.groups.page import PageGroup
7 |
--------------------------------------------------------------------------------
/marker/schema/groups/base.py:
--------------------------------------------------------------------------------
1 | from marker.schema.blocks import Block
2 |
3 |
4 | class Group(Block):
5 | pass
--------------------------------------------------------------------------------
/marker/schema/groups/figure.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.groups.base import Group
3 |
4 |
5 | class FigureGroup(Group):
6 | block_type: BlockTypes = BlockTypes.FigureGroup
7 | block_description: str = "A group that contains a figure and associated captions."
8 |
--------------------------------------------------------------------------------
/marker/schema/groups/list.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.groups.base import Group
3 |
4 |
5 | class ListGroup(Group):
6 | block_type: BlockTypes = BlockTypes.ListGroup
7 | has_continuation: bool = False
8 | block_description: str = "A group of list items that should be rendered together."
9 |
10 | def assemble_html(self, document, child_blocks, parent_structure):
11 | template = super().assemble_html(document, child_blocks, parent_structure)
12 |
13 | el_attr = f" block-type='{self.block_type}'"
14 | if self.has_continuation:
15 | el_attr += " class='has-continuation'"
16 |         return f"<ul{el_attr}>{template}</ul>"
17 |
--------------------------------------------------------------------------------
/marker/schema/groups/picture.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.groups.base import Group
3 |
4 |
5 | class PictureGroup(Group):
6 | block_type: BlockTypes = BlockTypes.PictureGroup
7 | block_description: str = "A picture along with associated captions."
8 |
--------------------------------------------------------------------------------
/marker/schema/groups/table.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.groups.base import Group
3 |
4 |
5 | class TableGroup(Group):
6 | block_type: BlockTypes = BlockTypes.TableGroup
7 | block_description: str = "A table along with associated captions."
8 |
--------------------------------------------------------------------------------
/marker/schema/registry.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Type
2 | from importlib import import_module
3 |
4 | from marker.schema import BlockTypes
5 | from marker.schema.blocks import (
6 | Block,
7 | Caption,
8 | Code,
9 | Equation,
10 | Figure,
11 | Footnote,
12 | Form,
13 | Handwriting,
14 | InlineMath,
15 | ListItem,
16 | PageFooter,
17 | PageHeader,
18 | Picture,
19 | SectionHeader,
20 | Table,
21 | TableOfContents,
22 | Text,
23 | ComplexRegion,
24 | TableCell,
25 | Reference,
26 | )
27 | from marker.schema.document import Document
28 | from marker.schema.groups import (
29 | FigureGroup,
30 | ListGroup,
31 | PageGroup,
32 | PictureGroup,
33 | TableGroup,
34 | )
35 | from marker.schema.text import Line, Span
36 | from marker.schema.text.char import Char
37 |
38 | BLOCK_REGISTRY: Dict[BlockTypes, str] = {}
39 |
40 |
41 | def register_block_class(block_type: BlockTypes, block_cls: Type[Block]):
42 | BLOCK_REGISTRY[block_type] = f"{block_cls.__module__}.{block_cls.__name__}"
43 |
44 |
45 | def get_block_class(block_type: BlockTypes) -> Type[Block]:
46 | class_path = BLOCK_REGISTRY[block_type]
47 | module_name, class_name = class_path.rsplit(".", 1)
48 | module = import_module(module_name)
49 | return getattr(module, class_name)
50 |
51 |
52 | register_block_class(BlockTypes.Line, Line)
53 | register_block_class(BlockTypes.Span, Span)
54 | register_block_class(BlockTypes.Char, Char)
55 | register_block_class(BlockTypes.FigureGroup, FigureGroup)
56 | register_block_class(BlockTypes.TableGroup, TableGroup)
57 | register_block_class(BlockTypes.ListGroup, ListGroup)
58 | register_block_class(BlockTypes.PictureGroup, PictureGroup)
59 | register_block_class(BlockTypes.Page, PageGroup)
60 | register_block_class(BlockTypes.Caption, Caption)
61 | register_block_class(BlockTypes.Code, Code)
62 | register_block_class(BlockTypes.Figure, Figure)
63 | register_block_class(BlockTypes.Footnote, Footnote)
64 | register_block_class(BlockTypes.Form, Form)
65 | register_block_class(BlockTypes.Equation, Equation)
66 | register_block_class(BlockTypes.Handwriting, Handwriting)
67 | register_block_class(BlockTypes.TextInlineMath, InlineMath)
68 | register_block_class(BlockTypes.ListItem, ListItem)
69 | register_block_class(BlockTypes.PageFooter, PageFooter)
70 | register_block_class(BlockTypes.PageHeader, PageHeader)
71 | register_block_class(BlockTypes.Picture, Picture)
72 | register_block_class(BlockTypes.SectionHeader, SectionHeader)
73 | register_block_class(BlockTypes.Table, Table)
74 | register_block_class(BlockTypes.Text, Text)
75 | register_block_class(BlockTypes.TableOfContents, TableOfContents)
76 | register_block_class(BlockTypes.ComplexRegion, ComplexRegion)
77 | register_block_class(BlockTypes.TableCell, TableCell)
78 | register_block_class(BlockTypes.Reference, Reference)
79 | register_block_class(BlockTypes.Document, Document)
80 |
81 | assert len(BLOCK_REGISTRY) == len(BlockTypes)
82 | assert all(
83 | [
84 | get_block_class(k).model_fields["block_type"].default == k
85 | for k, _ in BLOCK_REGISTRY.items()
86 | ]
87 | )
88 |
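Sketch: because the registry stores dotted paths and resolves them lazily through import_module, a block class can be swapped at runtime. MyText is hypothetical.

from marker.schema import BlockTypes
from marker.schema.blocks import Text
from marker.schema.registry import get_block_class, register_block_class

class MyText(Text):
    pass

register_block_class(BlockTypes.Text, MyText)
assert get_block_class(BlockTypes.Text) is MyText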
--------------------------------------------------------------------------------
/marker/schema/text/__init__.py:
--------------------------------------------------------------------------------
1 | from marker.schema.text.line import Line
2 | from marker.schema.text.span import Span
3 |
--------------------------------------------------------------------------------
/marker/schema/text/char.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 |
4 |
5 | class Char(Block):
6 | block_type: BlockTypes = BlockTypes.Char
7 | block_description: str = "A single character inside a span."
8 |
9 | text: str
10 | idx: int
11 |
--------------------------------------------------------------------------------
/marker/schema/text/span.py:
--------------------------------------------------------------------------------
1 | import html
2 | import re
3 | from typing import List, Literal, Optional
4 |
5 | from marker.schema import BlockTypes
6 | from marker.schema.blocks import Block
7 |
8 |
9 | def cleanup_text(full_text):
10 | full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text)
11 | full_text = full_text.replace('\xa0', ' ') # Replace non-breaking spaces
12 | return full_text
13 |
14 |
15 | class Span(Block):
16 | block_type: BlockTypes = BlockTypes.Span
17 | block_description: str = "A span of text inside a line."
18 |
19 | text: str
20 | font: str
21 | font_weight: float
22 | font_size: float
23 | minimum_position: int
24 | maximum_position: int
25 | formats: List[Literal['plain', 'math', 'chemical', 'bold', 'italic', 'highlight', 'subscript', 'superscript', 'small', 'code', 'underline']]
26 | has_superscript: bool = False
27 | has_subscript: bool = False
28 | url: Optional[str] = None
29 | html: Optional[str] = None
30 |
31 | @property
32 | def bold(self):
33 | return 'bold' in self.formats
34 |
35 | @property
36 | def italic(self):
37 | return 'italic' in self.formats
38 |
39 | @property
40 | def math(self):
41 | return 'math' in self.formats
42 |
43 | @property
44 | def highlight(self):
45 | return 'highlight' in self.formats
46 |
47 | @property
48 | def superscript(self):
49 | return 'superscript' in self.formats
50 |
51 | @property
52 | def subscript(self):
53 | return 'subscript' in self.formats
54 |
55 | @property
56 | def small(self):
57 | return 'small' in self.formats
58 |
59 | @property
60 | def code(self):
61 | return 'code' in self.formats
62 |
63 | @property
64 | def underline(self):
65 | return 'underline' in self.formats
66 |
67 | def assemble_html(self, document, child_blocks, parent_structure):
68 | if self.ignore_for_output:
69 | return ""
70 |
71 | if self.html:
72 | return self.html
73 |
74 | text = self.text
75 |
76 | # Remove trailing newlines
77 | replaced_newline = False
78 | while len(text) > 0 and text[-1] in ["\n", "\r"]:
79 | text = text[:-1]
80 | replaced_newline = True
81 |
82 | # Remove leading newlines
83 | while len(text) > 0 and text[0] in ["\n", "\r"]:
84 | text = text[1:]
85 |
86 | if replaced_newline and not text.endswith('-'):
87 | text += " "
88 |
89 | text = text.replace("-\n", "") # Remove hyphenated line breaks from the middle of the span
90 | text = html.escape(text)
91 | text = cleanup_text(text)
92 |
93 | if self.has_superscript:
94 |             text = re.sub(r"^([0-9\W]+)(.*)", r"<sup>\1</sup>\2", text)
95 |
96 | # Handle full block superscript
97 | if "" not in text:
98 | text = f"{text}"
99 |
100 | if self.url:
101 |             text = f"<a href='{self.url}'>{text}</a>"
102 |
103 | # TODO Support multiple formats
104 |         if self.italic:
105 |             text = f"<i>{text}</i>"
106 |         elif self.bold:
107 |             text = f"<b>{text}</b>"
108 |         elif self.math:
109 |             text = f"<math display='inline'>{text}</math>"
110 |         elif self.highlight:
111 |             text = f"<mark>{text}</mark>"
112 |         elif self.subscript:
113 |             text = f"<sub>{text}</sub>"
114 |         elif self.superscript:
115 |             text = f"<sup>{text}</sup>"
116 |         elif self.underline:
117 |             text = f"<u>{text}</u>"
118 |         elif self.small:
119 |             text = f"<small>{text}</small>"
120 |         elif self.code:
121 |             text = f"<code>{text}</code>"
122 |
123 | return text
124 |
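Worked example of cleanup_text: three or more newline-plus-whitespace pairs collapse to a single blank line, and non-breaking spaces become plain spaces.

from marker.schema.text.span import cleanup_text

print(repr(cleanup_text("a\n \n \n b\xa0c")))  # 'a\n\nb c'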
--------------------------------------------------------------------------------
/marker/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/marker/scripts/__init__.py
--------------------------------------------------------------------------------
/marker/scripts/chunk_convert.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import subprocess
4 | import pkg_resources
5 |
6 |
7 | def chunk_convert_cli():
8 | parser = argparse.ArgumentParser(description="Convert a folder of PDFs to a folder of markdown files in chunks.")
9 | parser.add_argument("in_folder", help="Input folder with pdfs.")
10 | parser.add_argument("out_folder", help="Output folder")
11 | args = parser.parse_args()
12 |
13 | cur_dir = os.path.dirname(os.path.abspath(__file__))
14 | script_path = os.path.join(cur_dir, "chunk_convert.sh")
15 |
16 | # Construct the command
17 | cmd = f"{script_path} {args.in_folder} {args.out_folder}"
18 |
19 | # Execute the shell script
20 | subprocess.run(cmd, shell=True, check=True)
--------------------------------------------------------------------------------
/marker/scripts/chunk_convert.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | trap 'pkill -P $$' SIGINT
4 |
5 | # Check if NUM_DEVICES is set
6 | if [[ -z "$NUM_DEVICES" ]]; then
7 | echo "Please set the NUM_DEVICES environment variable."
8 | exit 1
9 | fi
10 |
11 | if [[ -z "$NUM_WORKERS" ]]; then
12 | echo "Please set the NUM_WORKERS environment variable."
13 | exit 1
14 | fi
15 |
16 | # Get input folder and output folder from args
17 | if [[ -z "$1" ]]; then
18 | echo "Please provide an input folder."
19 | exit 1
20 | fi
21 |
22 | if [[ -z "$2" ]]; then
23 | echo "Please provide an output folder."
24 | exit 1
25 | fi
26 |
27 | INPUT_FOLDER=$1
28 | OUTPUT_FOLDER=$2
29 |
30 | # Ensure output folder exists
31 | mkdir -p "$OUTPUT_FOLDER"
32 |
33 | # Loop from 0 to NUM_DEVICES and run the marker command in parallel
34 | for (( i=0; i<$NUM_DEVICES; i++ )); do
35 | DEVICE_NUM=$i
36 | export DEVICE_NUM
37 | export NUM_DEVICES
38 | export NUM_WORKERS
39 | echo "Running marker on GPU $DEVICE_NUM"
40 | cmd="CUDA_VISIBLE_DEVICES=$DEVICE_NUM marker $INPUT_FOLDER --output_dir $OUTPUT_FOLDER --num_chunks $NUM_DEVICES --chunk_idx $DEVICE_NUM --workers $NUM_WORKERS"
41 | eval $cmd &
42 |
43 | sleep 5
44 | done
45 |
46 | # Wait for all background processes to finish
47 | wait
--------------------------------------------------------------------------------
/marker/scripts/convert_single.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | os.environ["GRPC_VERBOSITY"] = "ERROR"
4 | os.environ["GLOG_minloglevel"] = "2"
5 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = (
6 | "1" # Transformers uses .isin for a simple op, which is not supported on MPS
7 | )
8 |
9 | import time
10 | import click
11 |
12 | from marker.config.parser import ConfigParser
13 | from marker.config.printer import CustomClickPrinter
14 | from marker.logger import configure_logging, get_logger
15 | from marker.models import create_model_dict
16 | from marker.output import save_output
17 |
18 | configure_logging()
19 | logger = get_logger()
20 |
21 |
22 | @click.command(cls=CustomClickPrinter, help="Convert a single PDF to markdown.")
23 | @click.argument("fpath", type=str)
24 | @ConfigParser.common_options
25 | def convert_single_cli(fpath: str, **kwargs):
26 | models = create_model_dict()
27 | start = time.time()
28 | config_parser = ConfigParser(kwargs)
29 |
30 | converter_cls = config_parser.get_converter_cls()
31 | converter = converter_cls(
32 | config=config_parser.generate_config_dict(),
33 | artifact_dict=models,
34 | processor_list=config_parser.get_processors(),
35 | renderer=config_parser.get_renderer(),
36 | llm_service=config_parser.get_llm_service(),
37 | )
38 | rendered = converter(fpath)
39 | out_folder = config_parser.get_output_folder(fpath)
40 | save_output(rendered, out_folder, config_parser.get_base_filename(fpath))
41 |
42 | logger.info(f"Saved markdown to {out_folder}")
43 | logger.info(f"Total time: {time.time() - start}")
44 |
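The same pipeline can be driven programmatically; this mirrors the CLI body above (the output_format value and input path are illustrative).

from marker.config.parser import ConfigParser
from marker.models import create_model_dict

config_parser = ConfigParser({"output_format": "markdown"})
converter = config_parser.get_converter_cls()(
    config=config_parser.generate_config_dict(),
    artifact_dict=create_model_dict(),
    processor_list=config_parser.get_processors(),
    renderer=config_parser.get_renderer(),
    llm_service=config_parser.get_llm_service(),
)
# rendered = converter("paper.pdf")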
--------------------------------------------------------------------------------
/marker/scripts/file_to_s3.py:
--------------------------------------------------------------------------------
1 | import json
2 | import shutil
3 | import datetime
4 | from pathlib import Path
5 | import boto3
6 |
7 | from huggingface_hub import snapshot_download
8 |
9 | import click
10 |
11 | S3_API_URL = "https://1afbe4656a6b40d982ab5e730a39f6b9.r2.cloudflarestorage.com"
12 |
13 | @click.command(help="Uploads files to an S3 bucket")
14 | @click.argument("filepath", type=str)
15 | @click.argument("s3_path", type=str)
16 | @click.option("--bucket_name", type=str, default="datalab")
17 | @click.option("--access_key_id", type=str, default="")
18 | @click.option("--access_key_secret", type=str, default="")
19 | def main(filepath: str, s3_path: str, bucket_name: str, access_key_id: str, access_key_secret: str):
20 | filepath = Path(filepath)
21 | # Upload the files to S3
22 | s3_client = boto3.client(
23 | 's3',
24 | endpoint_url=S3_API_URL,
25 | aws_access_key_id=access_key_id,
26 | aws_secret_access_key=access_key_secret,
27 | region_name="enam"
28 | )
29 |
30 | s3_key = f"{s3_path}/{filepath.name}"
31 |
32 | try:
33 | s3_client.upload_file(
34 | str(filepath),
35 | bucket_name,
36 | s3_key
37 | )
38 | except Exception as e:
39 | print(f"Error uploading {filepath}: {str(e)}")
40 |     else:
41 |         print(f"Uploaded files to {s3_path}")
42 |
43 | if __name__ == "__main__":
44 | main()
45 |
46 |
47 |
48 |
--------------------------------------------------------------------------------
/marker/scripts/run_streamlit_app.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import os
3 | import sys
4 |
5 |
6 | def streamlit_app_cli(app_name: str = "streamlit_app.py"):
7 | argv = sys.argv[1:]
8 | cur_dir = os.path.dirname(os.path.abspath(__file__))
9 | app_path = os.path.join(cur_dir, app_name)
10 | cmd = [
11 | "streamlit",
12 | "run",
13 | app_path,
14 | "--server.fileWatcherType",
15 | "none",
16 | "--server.headless",
17 | "true",
18 | ]
19 | if argv:
20 | cmd += ["--"] + argv
21 | subprocess.run(cmd, env={**os.environ, "IN_STREAMLIT": "true"})
22 |
23 |
24 | def extraction_app_cli():
25 | streamlit_app_cli("extraction_app.py")
26 |
--------------------------------------------------------------------------------
/marker/services/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, List, Annotated
2 |
3 | import PIL
4 | from pydantic import BaseModel
5 |
6 | from marker.schema.blocks import Block
7 | from marker.util import assign_config, verify_config_keys
8 |
9 |
10 | class BaseService:
11 | timeout: Annotated[int, "The timeout to use for the service."] = 30
12 | max_retries: Annotated[
13 | int, "The maximum number of retries to use for the service."
14 | ] = 2
15 | retry_wait_time: Annotated[int, "The wait time between retries."] = 3
16 |
17 | def __init__(self, config: Optional[BaseModel | dict] = None):
18 | assign_config(self, config)
19 |
20 | # Ensure we have all necessary fields filled out (API keys, etc.)
21 | verify_config_keys(self)
22 |
23 | def __call__(
24 | self,
25 | prompt: str,
26 | image: PIL.Image.Image | List[PIL.Image.Image],
27 | block: Block,
28 | response_schema: type[BaseModel],
29 | max_retries: int | None = None,
30 | timeout: int | None = None,
31 | ):
32 | raise NotImplementedError
33 |
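Minimal sketch of a custom service: only __call__ needs a body, since BaseService.__init__ already handles config assignment and key verification.

from marker.services import BaseService

class EchoService(BaseService):
    def __call__(self, prompt, image, block, response_schema,
                 max_retries=None, timeout=None):
        # A real service would call a model and return JSON matching
        # response_schema; {} mirrors the failure path used by the services.
        return {}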
--------------------------------------------------------------------------------
/marker/services/gemini.py:
--------------------------------------------------------------------------------
1 | import json
2 | import time
3 | from io import BytesIO
4 | from typing import List, Annotated
5 |
6 | import PIL
7 | from google import genai
8 | from google.genai import types
9 | from google.genai.errors import APIError
10 | from marker.logger import get_logger
11 | from pydantic import BaseModel
12 |
13 | from marker.schema.blocks import Block
14 | from marker.services import BaseService
15 |
16 | logger = get_logger()
17 |
18 |
19 | class BaseGeminiService(BaseService):
20 | gemini_model_name: Annotated[
21 | str, "The name of the Google model to use for the service."
22 | ] = "gemini-2.0-flash"
23 |
24 | def img_to_bytes(self, img: PIL.Image.Image):
25 | image_bytes = BytesIO()
26 | img.save(image_bytes, format="WEBP")
27 | return image_bytes.getvalue()
28 |
29 | def get_google_client(self, timeout: int):
30 | raise NotImplementedError
31 |
32 | def __call__(
33 | self,
34 | prompt: str,
35 | image: PIL.Image.Image | List[PIL.Image.Image],
36 | block: Block,
37 | response_schema: type[BaseModel],
38 | max_retries: int | None = None,
39 | timeout: int | None = None,
40 | ):
41 | if max_retries is None:
42 | max_retries = self.max_retries
43 |
44 | if timeout is None:
45 | timeout = self.timeout
46 |
47 | if not isinstance(image, list):
48 | image = [image]
49 |
50 | client = self.get_google_client(timeout=timeout)
51 | image_parts = [
52 | types.Part.from_bytes(data=self.img_to_bytes(img), mime_type="image/webp")
53 | for img in image
54 | ]
55 |
56 | tries = 0
57 | while tries < max_retries:
58 | try:
59 | responses = client.models.generate_content(
60 | model=self.gemini_model_name,
61 | contents=image_parts
62 | + [
63 | prompt
64 | ], # According to gemini docs, it performs better if the image is the first element
65 | config={
66 | "temperature": 0,
67 | "response_schema": response_schema,
68 | "response_mime_type": "application/json",
69 | },
70 | )
71 | output = responses.candidates[0].content.parts[0].text
72 | total_tokens = responses.usage_metadata.total_token_count
73 | block.update_metadata(llm_tokens_used=total_tokens, llm_request_count=1)
74 | return json.loads(output)
75 | except APIError as e:
76 | if e.code in [429, 443, 503]:
77 |                     # Rate-limited or temporarily unavailable; back off and retry
78 | tries += 1
79 | wait_time = tries * self.retry_wait_time
80 | logger.warning(
81 | f"APIError: {e}. Retrying in {wait_time} seconds... (Attempt {tries}/{max_retries})"
82 | )
83 | time.sleep(wait_time)
84 | else:
85 | logger.error(f"APIError: {e}")
86 | break
87 | except Exception as e:
88 | logger.error(f"Exception: {e}")
89 | break
90 |
91 | return {}
92 |
93 |
94 | class GoogleGeminiService(BaseGeminiService):
95 | gemini_api_key: Annotated[str, "The Google API key to use for the service."] = None
96 |
97 | def get_google_client(self, timeout: int):
98 | return genai.Client(
99 | api_key=self.gemini_api_key,
100 | http_options={"timeout": timeout * 1000}, # Convert to milliseconds
101 | )
102 |
--------------------------------------------------------------------------------
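A hedged usage sketch for the service above; the response schema and API key are placeholders, and a real Block from a converted document is needed before the commented call will run:

import PIL.Image
from pydantic import BaseModel

from marker.services.gemini import GoogleGeminiService


class FigureCaption(BaseModel):  # illustrative response schema
    caption: str


service = GoogleGeminiService({"gemini_api_key": "YOUR_KEY", "timeout": 60})
image = PIL.Image.new("RGB", (100, 100))  # stands in for a rendered block image

# With `block` taken from a converted document (so token usage can be recorded):
# result = service("Write a one-line caption.", image, block, FigureCaption)
# `result` is a dict matching FigureCaption, or {} once retries are exhausted.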
/marker/services/ollama.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import json
3 | from io import BytesIO
4 | from typing import Annotated, List
5 |
6 | import PIL
7 | import requests
8 | from marker.logger import get_logger
9 | from pydantic import BaseModel
10 |
11 | from marker.schema.blocks import Block
12 | from marker.services import BaseService
13 |
14 | logger = get_logger()
15 |
16 |
17 | class OllamaService(BaseService):
18 | ollama_base_url: Annotated[
19 | str, "The base url to use for ollama. No trailing slash."
20 | ] = "http://localhost:11434"
21 | ollama_model: Annotated[str, "The model name to use for ollama."] = (
22 | "llama3.2-vision"
23 | )
24 |
25 | def image_to_base64(self, image: PIL.Image.Image):
26 | image_bytes = BytesIO()
27 | image.save(image_bytes, format="PNG")
28 | return base64.b64encode(image_bytes.getvalue()).decode("utf-8")
29 |
30 | def __call__(
31 | self,
32 | prompt: str,
33 | image: PIL.Image.Image | List[PIL.Image.Image],
34 | block: Block,
35 | response_schema: type[BaseModel],
36 | max_retries: int | None = None,
37 | timeout: int | None = None,
38 | ):
39 | url = f"{self.ollama_base_url}/api/generate"
40 | headers = {"Content-Type": "application/json"}
41 |
42 | schema = response_schema.model_json_schema()
43 | format_schema = {
44 | "type": "object",
45 | "properties": schema["properties"],
46 | "required": schema["required"],
47 | }
48 |
49 | if not isinstance(image, list):
50 | image = [image]
51 |
52 | image_bytes = [self.image_to_base64(img) for img in image]
53 |
54 | payload = {
55 | "model": self.ollama_model,
56 | "prompt": prompt,
57 | "stream": False,
58 | "format": format_schema,
59 | "images": image_bytes,
60 | }
61 |
62 | try:
63 |             response = requests.post(url, json=payload, headers=headers, timeout=timeout or self.timeout)
64 | response.raise_for_status()
65 | response_data = response.json()
66 |
67 | total_tokens = (
68 | response_data["prompt_eval_count"] + response_data["eval_count"]
69 | )
70 | block.update_metadata(llm_request_count=1, llm_tokens_used=total_tokens)
71 |
72 | data = response_data["response"]
73 | return json.loads(data)
74 | except Exception as e:
75 | logger.warning(f"Ollama inference failed: {e}")
76 |
77 | return {}
78 |
--------------------------------------------------------------------------------
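The "format" field in the payload above is how OllamaService constrains the reply to valid JSON. A small sketch of the derivation it performs, using an illustrative pydantic schema:

import json

from pydantic import BaseModel


class TableShape(BaseModel):  # illustrative schema for a structured reply
    rows: int
    columns: int


schema = TableShape.model_json_schema()
format_schema = {
    "type": "object",
    "properties": schema["properties"],
    "required": schema["required"],
}
# This object is sent as the "format" key of the /api/generate payload.
print(json.dumps(format_schema, indent=2))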
/marker/services/vertex.py:
--------------------------------------------------------------------------------
1 | from typing import Annotated
2 |
3 | from google import genai
4 |
5 | from marker.services.gemini import BaseGeminiService
6 |
7 | class GoogleVertexService(BaseGeminiService):
8 | vertex_project_id: Annotated[
9 | str,
10 | "Google Cloud Project ID for Vertex AI.",
11 | ] = None
12 | vertex_location: Annotated[
13 | str,
14 | "Google Cloud Location for Vertex AI.",
15 | ] = "us-central1"
16 | gemini_model_name: Annotated[
17 | str,
18 | "The name of the Google model to use for the service."
19 | ] = "gemini-2.0-flash-001"
20 | vertex_dedicated: Annotated[
21 | bool,
22 | "Whether to use a dedicated Vertex AI instance."
23 | ] = False
24 |
25 | def get_google_client(self, timeout: int):
26 | http_options = {"timeout": timeout * 1000} # Convert to milliseconds
27 | if self.vertex_dedicated:
28 | http_options["headers"] = {"x-vertex-ai-llm-request-type": "dedicated"}
29 | return genai.Client(
30 | vertexai=True,
31 | project=self.vertex_project_id,
32 | location=self.vertex_location,
33 | http_options=http_options,
34 | )
--------------------------------------------------------------------------------
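Selecting the Vertex backend is a configuration change rather than a code change: tests/services/test_service_init.py below picks it by dotted path. A sketch of the same wiring, assuming marker's create_model_dict loader from marker/models.py; the project id is a placeholder:

from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict

converter = PdfConverter(
    artifact_dict=create_model_dict(),
    config={
        "use_llm": True,
        "llm_service": "marker.services.vertex.GoogleVertexService",
        "vertex_project_id": "my-gcp-project",  # placeholder
    },
)
# converter("document.pdf") now routes LLM calls through Vertex AI.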
/marker/settings.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from dotenv import find_dotenv
4 | from pydantic import computed_field
5 | from pydantic_settings import BaseSettings
6 | import torch
7 | import os
8 |
9 |
10 | class Settings(BaseSettings):
11 | # Paths
12 | BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
13 | OUTPUT_DIR: str = os.path.join(BASE_DIR, "conversion_results")
14 | FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts")
15 | DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data")
16 | ARTIFACT_URL: str = "https://models.datalab.to/artifacts"
17 | FONT_NAME: str = "GoNotoCurrent-Regular.ttf"
18 | FONT_PATH: str = os.path.join(FONT_DIR, FONT_NAME)
19 | LOGLEVEL: str = "INFO"
20 |
21 | # General
22 | OUTPUT_ENCODING: str = "utf-8"
23 | OUTPUT_IMAGE_FORMAT: str = "JPEG"
24 |
25 | # LLM
26 | GOOGLE_API_KEY: Optional[str] = ""
27 |
28 | # General models
29 | TORCH_DEVICE: Optional[str] = (
30 | None # Note: MPS device does not work for text detection, and will default to CPU
31 | )
32 |
33 | @computed_field
34 | @property
35 | def TORCH_DEVICE_MODEL(self) -> str:
36 | if self.TORCH_DEVICE is not None:
37 | return self.TORCH_DEVICE
38 |
39 | if torch.cuda.is_available():
40 | return "cuda"
41 |
42 | if torch.backends.mps.is_available():
43 | return "mps"
44 |
45 | return "cpu"
46 |
47 | @computed_field
48 | @property
49 | def MODEL_DTYPE(self) -> torch.dtype:
50 | if self.TORCH_DEVICE_MODEL == "cuda":
51 | return torch.bfloat16
52 | else:
53 | return torch.float32
54 |
55 | class Config:
56 | env_file = find_dotenv("local.env")
57 | extra = "ignore"
58 |
59 |
60 | settings = Settings()
61 |
--------------------------------------------------------------------------------
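Because settings = Settings() runs at import time, overrides must be in the process environment (or in local.env) before marker.settings is first imported. A minimal sketch:

import os

os.environ["TORCH_DEVICE"] = "cpu"  # must be set before the first marker import

from marker.settings import settings

assert settings.TORCH_DEVICE_MODEL == "cpu"
assert str(settings.MODEL_DTYPE) == "torch.float32"  # non-CUDA devices use float32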
/marker_app.py:
--------------------------------------------------------------------------------
1 | from marker.scripts.run_streamlit_app import streamlit_app_cli
2 |
3 | if __name__ == "__main__":
4 | streamlit_app_cli()
--------------------------------------------------------------------------------
/marker_server.py:
--------------------------------------------------------------------------------
1 | from marker.scripts.server import server_cli
2 |
3 | if __name__ == "__main__":
4 | server_cli()
5 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "marker-pdf"
3 | version = "1.7.4"
4 | description = "Convert documents to markdown with high speed and accuracy."
5 | authors = ["Vik Paruchuri "]
6 | readme = "README.md"
7 | license = "GPL-3.0-or-later"
8 | repository = "https://github.com/VikParuchuri/marker"
9 | keywords = ["pdf", "markdown", "ocr", "nlp"]
10 | packages = [
11 | {include = "marker"}
12 | ]
13 | include = [
14 | "marker/scripts/*.sh",
15 | "marker/scripts/*.html",
16 | ]
17 |
18 | [tool.poetry.dependencies]
19 | python = "^3.10"
20 | Pillow = "^10.1.0"
21 | pydantic = "^2.4.2"
22 | pydantic-settings = "^2.0.3"
23 | transformers = "^4.45.2"
24 | python-dotenv = "^1.0.0"
25 | torch = "^2.7.0"
26 | tqdm = "^4.66.1"
27 | ftfy = "^6.1.1"
28 | rapidfuzz = "^3.8.1"
29 | surya-ocr = "^0.14.5"
30 | regex = "^2024.4.28"
31 | pdftext = "~0.6.2"
32 | markdownify = "^0.13.1"
33 | click = "^8.2.0"
34 | markdown2 = "^2.5.2"
35 | filetype = "^1.2.0"
36 | scikit-learn = "^1.6.1"
37 | google-genai = "^1.0.0"
38 | anthropic = "^0.46.0"
39 | pre-commit = "^4.2.0"
40 |
41 | openai = "^1.65.2"
42 | # Optional dependencies for documents
43 | mammoth = {version = "^1.9.0", optional = true}
44 | openpyxl = {version = "^3.1.5", optional = true}
45 | python-pptx = {version = "^1.0.2", optional = true}
46 | ebooklib = {version = "^0.18", optional = true}
47 | weasyprint = {version = "^63.1", optional = true}
48 |
49 | [tool.poetry.group.dev.dependencies]
50 | jupyter = "^1.0.0"
51 | datasets = "^2.21.0"
52 | streamlit = "^1.37.1"
53 | fastapi = "^0.115.4"
54 | uvicorn = "^0.32.0"
55 | python-multipart = "^0.0.16"
56 | pytest = "^8.3.3"
57 | pytest-mock = "^3.14.0"
58 | apted = "1.0.3"
59 | distance = "0.1.3"
60 | lxml = "5.3.0"
61 | tabulate = "^0.9.0"
62 | latex2mathml = "^3.77.0"
63 | playwright = "^1.49.1"
64 |
65 | [tool.poetry.extras]
66 | full = ["mammoth", "openpyxl", "python-pptx", "ebooklib", "weasyprint"]
67 |
68 | [tool.poetry.scripts]
69 | marker = "marker.scripts.convert:convert_cli"
70 | marker_single = "marker.scripts.convert_single:convert_single_cli"
71 | marker_chunk_convert = "marker.scripts.chunk_convert:chunk_convert_cli"
72 | marker_gui = "marker.scripts.run_streamlit_app:streamlit_app_cli"
73 | marker_extract = "marker.scripts.run_streamlit_app:extraction_app_cli"
74 | marker_server = "marker.scripts.server:server_cli"
75 |
76 | [build-system]
77 | requires = ["poetry-core"]
78 | build-backend = "poetry.core.masonry.api"
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | testpaths=tests
3 | markers =
4 | filename(name): specify the filename for the pdf_document fixture
5 | filterwarnings =
6 | ignore::Warning
--------------------------------------------------------------------------------
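The filename and config markers registered here are consumed by fixtures in the suite's conftest, which is not part of this listing. A sketch of the underlying pattern, with a hypothetical fixture name; the real conftest may differ:

import pytest


@pytest.fixture
def pdf_name(request):  # hypothetical fixture name
    # Read @pytest.mark.filename("...") from the requesting test, falling back
    # to the default document the suite uses throughout.
    marker = request.node.get_closest_marker("filename")
    return marker.args[0] if marker else "adversarial.pdf"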
/static/fonts/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
--------------------------------------------------------------------------------
/tests/builders/test_blank_page.py:
--------------------------------------------------------------------------------
1 | from surya.layout.schema import LayoutResult
2 |
3 | from marker.builders.document import DocumentBuilder
4 | from marker.builders.layout import LayoutBuilder
5 | from marker.builders.line import LineBuilder
6 |
7 |
8 | def test_blank_page(config, doc_provider, layout_model, ocr_error_model, detection_model):
9 | layout_builder = LayoutBuilder(layout_model, config)
10 | line_builder = LineBuilder(detection_model, ocr_error_model)
11 | builder = DocumentBuilder(config)
12 | document = builder.build_document(doc_provider)
13 |
14 | layout_results = [LayoutResult(
15 | bboxes=[],
16 | image_bbox=p.polygon.bbox,
17 | ) for p in document.pages]
18 | provider_lines = {p.page_id: [] for p in document.pages}
19 | ocr_lines = {p.page_id: [] for p in document.pages}
20 |
21 | layout_builder.add_blocks_to_pages(document.pages, layout_results)
22 | line_builder.merge_blocks(document, provider_lines, ocr_lines)
23 |
24 | assert all([isinstance(p.children, list) for p in document.pages])
25 | assert all([isinstance(p.structure, list) for p in document.pages])
--------------------------------------------------------------------------------
/tests/builders/test_document_builder.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from marker.schema import BlockTypes
4 | from marker.schema.text.line import Line
5 |
6 |
7 | @pytest.mark.config({"page_range": [0]})
8 | def test_document_builder(pdf_document):
9 | first_page = pdf_document.pages[0]
10 | assert first_page.structure[0] == '/page/0/SectionHeader/0'
11 |
12 | first_block = first_page.get_block(first_page.structure[0])
13 | assert first_block.block_type == BlockTypes.SectionHeader
14 | assert first_block.text_extraction_method == 'pdftext'
15 |
16 | first_text_block: Line = first_page.get_block(first_block.structure[0])
17 | assert first_text_block.block_type == BlockTypes.Line
18 |
19 | first_span = first_page.get_block(first_text_block.structure[0])
20 | assert first_span.block_type == BlockTypes.Span
21 | assert first_span.text == 'Subspace Adversarial Training'
22 | assert first_span.font == 'NimbusRomNo9L-Medi'
23 | assert first_span.formats == ['plain']
24 |
--------------------------------------------------------------------------------
/tests/builders/test_garbled_pdf.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from marker.builders.document import DocumentBuilder
4 | from marker.builders.line import LineBuilder
5 | from marker.processors.table import TableProcessor
6 | from marker.schema import BlockTypes
7 |
8 |
9 | @pytest.mark.filename("water_damage.pdf")
10 | def test_garbled_pdf(pdf_document, detection_model, recognition_model, table_rec_model):
11 | assert pdf_document.pages[0].structure[0] == "/page/0/Table/0"
12 |
13 | table_block = pdf_document.pages[0].get_block(pdf_document.pages[0].structure[0])
14 | assert table_block.block_type == BlockTypes.Table
15 | assert table_block.structure[0] == "/page/0/Line/1"
16 |
17 | table_cell = pdf_document.pages[0].get_block(table_block.structure[0])
18 | assert table_cell.block_type == BlockTypes.Line
19 |
20 | # We don't OCR in the initial pass, only with the TableProcessor
21 | processor = TableProcessor(detection_model, recognition_model, table_rec_model)
22 | processor(pdf_document)
23 |
24 | table = pdf_document.pages[0].contained_blocks(pdf_document, (BlockTypes.Table,))[0]
25 | assert "варіант" in table.raw_text(pdf_document)
26 |
27 | table_cell = pdf_document.pages[0].get_block(table_block.structure[0])
28 | assert table_cell.block_type == BlockTypes.TableCell
29 |
30 |
31 | @pytest.mark.filename("hindi_judgement.pdf")
32 | @pytest.mark.config({"page_range": [2, 3], "disable_ocr": True})
33 | def test_garbled_builder(config, doc_provider, detection_model, ocr_error_model):
34 | line_builder = LineBuilder(detection_model, ocr_error_model, config)
35 | builder = DocumentBuilder(config)
36 | document = builder.build_document(doc_provider)
37 |
38 | bad_ocr_results = line_builder.ocr_error_detection(
39 | document.pages, doc_provider.page_lines
40 | )
41 | assert len(bad_ocr_results.labels) == 2
42 | assert any([label == "bad" for label in bad_ocr_results.labels])
43 |
44 |
45 | @pytest.mark.filename("adversarial.pdf")
46 | @pytest.mark.config({"page_range": [2, 3], "disable_ocr": True})
47 | def test_nongarbled_builder(config, doc_provider, detection_model, ocr_error_model):
48 | line_builder = LineBuilder(detection_model, ocr_error_model, config)
49 | builder = DocumentBuilder(config)
50 | document = builder.build_document(doc_provider)
51 |
52 | bad_ocr_results = line_builder.ocr_error_detection(
53 | document.pages, doc_provider.page_lines
54 | )
55 | assert len(bad_ocr_results.labels) == 2
56 | assert all([label == "good" for label in bad_ocr_results.labels])
57 |
--------------------------------------------------------------------------------
/tests/builders/test_layout_replace.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from marker.builders.document import DocumentBuilder
4 | from marker.builders.layout import LayoutBuilder
5 | from marker.builders.line import LineBuilder
6 | from marker.renderers.markdown import MarkdownRenderer
7 | from marker.schema import BlockTypes
8 | from marker.schema.registry import get_block_class
9 |
10 |
11 | @pytest.mark.config({"page_range": [0]})
12 | def test_layout_replace(request, config, doc_provider, layout_model, ocr_error_model, detection_model):
13 | # The llm layout builder replaces blocks - this makes sure text is still merged properly
14 | layout_builder = LayoutBuilder(layout_model, config)
15 | line_builder = LineBuilder(detection_model, ocr_error_model, config)
16 | builder = DocumentBuilder(config)
17 | document = builder.build_document(doc_provider)
18 | layout_builder(document, doc_provider)
19 | page = document.pages[0]
20 | new_blocks = []
21 | for block in page.contained_blocks(document, (BlockTypes.Text,)):
22 | generated_block_class = get_block_class(BlockTypes.TextInlineMath)
23 | generated_block = generated_block_class(
24 | polygon=block.polygon,
25 | page_id=block.page_id,
26 | structure=block.structure,
27 | )
28 | page.replace_block(block, generated_block)
29 | new_blocks.append(generated_block)
30 | line_builder(document, doc_provider)
31 |
32 | for block in new_blocks:
33 | assert block.raw_text(document).strip()
34 |
35 | renderer = MarkdownRenderer(config)
36 | rendered = renderer(document)
37 |
38 | assert "worst-case perturbations" in rendered.markdown
39 | assert "projected gradient descent" in rendered.markdown
40 |
41 |
42 |
43 |
--------------------------------------------------------------------------------
/tests/builders/test_line_builder.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from marker.schema import BlockTypes
4 |
5 | # Page contains provider lines that are longer than detected lines
6 | # Any bad merging will break the final OCR results when format_lines is enabled
7 | @pytest.mark.filename("mixed_eng_hindi.pdf")
8 | @pytest.mark.config({"page_range": [2], "format_lines": True})
9 | def test_provider_detected_line_merge(pdf_document):
10 | page = pdf_document.pages[0]
11 | text_lines = page.contained_blocks(pdf_document, (BlockTypes.Line,))
12 |
13 | # This count includes detected lines merged in with provider lines
14 | assert len(text_lines) == 83
15 |
16 | # Page provider lines only contain English, while the Hindi text is missing
17 | # format_lines should fill in the missing lines
18 | @pytest.mark.filename("mixed_eng_hindi.pdf")
19 | @pytest.mark.config({"page_range": [0], "format_lines": True})
20 | def test_fill_missing_provider_lines(pdf_document):
21 | page = pdf_document.pages[0]
22 | raw_text = page.raw_text(pdf_document)
23 | assert "प्राधिकार से प्रकाशित" in raw_text
24 | assert "खान मंत्रालय" in raw_text
--------------------------------------------------------------------------------
/tests/builders/test_merged_lines.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from marker.schema import BlockTypes
4 |
5 |
6 | @pytest.mark.config({"page_range": [6], "format_lines": True, "disable_ocr": True})
7 | @pytest.mark.filename("bad_math.pdf")
8 | def test_keep_ocr(pdf_document):
9 | contained_lines = pdf_document.pages[0].contained_blocks(
10 | pdf_document, [BlockTypes.Line]
11 | )
12 |
13 | # Check that we grabbed the right text
14 | assert "Lemma" in contained_lines[-1].formatted_text(pdf_document)
15 | assert "distribution" in contained_lines[-2].formatted_text(pdf_document)
16 |
17 |     # The last line starts below the bottom of the second-to-last line
18 | assert contained_lines[-1].polygon.bbox[1] > contained_lines[-2].polygon.bbox[3]
19 |
--------------------------------------------------------------------------------
/tests/builders/test_ocr_builder.py:
--------------------------------------------------------------------------------
1 | from PIL import Image
2 |
3 | from marker.builders.ocr import OcrBuilder
4 |
5 |
6 | def test_blank_char_builder(recognition_model):
7 | builder = OcrBuilder(recognition_model)
8 | image = Image.new("RGB", (100, 100))
9 | spans = builder.spans_from_html_chars([], None, image) # Test with empty char list
10 | assert len(spans) == 0
11 |
--------------------------------------------------------------------------------
/tests/builders/test_ocr_pipeline.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from marker.schema import BlockTypes
4 | from marker.schema.text.line import Line
5 |
6 |
7 | def _ocr_pipeline_test(pdf_document):
8 | first_page = pdf_document.pages[0]
9 | assert first_page.structure[0] == "/page/0/SectionHeader/0"
10 |
11 | first_block = first_page.get_block(first_page.structure[0])
12 | assert first_block.text_extraction_method == "surya"
13 | assert first_block.block_type == BlockTypes.SectionHeader
14 |
15 | first_text_block: Line = first_page.get_block(first_block.structure[0])
16 | assert first_text_block.block_type == BlockTypes.Line
17 |
18 | first_span = first_page.get_block(first_text_block.structure[0])
19 | assert first_span.block_type == BlockTypes.Span
20 | assert first_span.text.strip() == "Subspace Adversarial Training"
21 |
22 | # Ensure we match all text lines up properly
23 | # Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
24 | text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
25 | text_blocks = first_page.contained_blocks(
26 | pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath)
27 | )
28 | assert len(text_lines) == 83
29 |
30 | # Ensure the bbox sizes match up
31 | max_line_position = max([line.polygon.y_end for line in text_lines])
32 | max_block_position = max(
33 | [block.polygon.y_end for block in text_blocks if block.source == "layout"]
34 | )
35 | assert max_line_position <= (max_block_position * 1.02)
36 |
37 |
38 | @pytest.mark.config({"force_ocr": True, "page_range": [0]})
39 | def test_ocr_pipeline(pdf_document):
40 | _ocr_pipeline_test(pdf_document)
41 |
42 |
43 | @pytest.mark.config({"force_ocr": True, "page_range": [0], "use_llm": True})
44 | def test_ocr_with_inline_pipeline(pdf_document):
45 | _ocr_pipeline_test(pdf_document)
46 |
--------------------------------------------------------------------------------
/tests/builders/test_overriding.py:
--------------------------------------------------------------------------------
1 | import multiprocessing as mp
2 |
3 | import pytest
4 |
5 | from marker.providers.pdf import PdfProvider
6 | from marker.schema import BlockTypes
7 | from marker.schema.blocks import SectionHeader
8 | from marker.schema.document import Document
9 | from marker.schema.registry import register_block_class
10 | from marker.schema.text import Line
11 | from tests.utils import setup_pdf_provider
12 |
13 |
14 | class NewSectionHeader(SectionHeader):
15 | pass
16 |
17 |
18 | class NewLine(Line):
19 | pass
20 |
21 |
22 | @pytest.mark.config({
23 | "page_range": [0],
24 | "override_map": {BlockTypes.SectionHeader: NewSectionHeader}
25 | })
26 | def test_overriding(pdf_document: Document):
27 | assert pdf_document.pages[0]\
28 | .get_block(pdf_document.pages[0].structure[0]).__class__ == NewSectionHeader
29 |
30 |
31 | def get_lines(pdf: str, config=None):
32 | for block_type, block_cls in config["override_map"].items():
33 | register_block_class(block_type, block_cls)
34 |
35 | provider: PdfProvider = setup_pdf_provider(pdf, config)
36 | return provider.get_page_lines(0)
37 |
38 |
39 | def test_overriding_mp():
40 | config = {
41 | "page_range": [0],
42 | "override_map": {BlockTypes.Line: NewLine}
43 | }
44 |
45 | pdf_list = ["adversarial.pdf", "adversarial_rot.pdf"]
46 |
47 | with mp.Pool(processes=2) as pool:
48 | results = pool.starmap(get_lines, [(pdf, config) for pdf in pdf_list])
49 | assert all([r[0].line.__class__ == NewLine for r in results])
50 |
--------------------------------------------------------------------------------
/tests/builders/test_pdf_links.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import pytest
4 |
5 | from marker.converters.pdf import PdfConverter
6 | from marker.renderers.markdown import MarkdownOutput
7 | from marker.schema import BlockTypes
8 | from marker.schema.document import Document
9 | from marker.util import classes_to_strings
10 |
11 |
12 | @pytest.mark.filename("arxiv_test.pdf")
13 | @pytest.mark.output_format("markdown")
14 | def test_pdf_links(pdf_document: Document, config, renderer, model_dict, temp_doc):
15 | first_page = pdf_document.pages[1]
16 |
17 | processors = ["marker.processors.reference.ReferenceProcessor"]
18 | pdf_converter = PdfConverter(
19 | artifact_dict=model_dict,
20 | processor_list=processors,
21 | renderer=classes_to_strings([renderer])[0],
22 | config=config
23 | )
24 |
25 | for section_header_span in first_page.contained_blocks(pdf_document, (BlockTypes.Span,)):
26 | if "II." in section_header_span.text:
27 | assert section_header_span.url == "#page-1-0"
28 | break
29 | else:
30 | raise ValueError("Could not find II. in the first page")
31 |
32 | section_header_block = first_page.contained_blocks(pdf_document, (BlockTypes.SectionHeader,))[0]
33 | assert section_header_block.raw_text(pdf_document) == 'II. THEORETICAL FRAMEWORK\n'
34 |
35 | assert first_page.refs[0].ref == "page-1-0"
36 |
37 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
38 | markdown = markdown_output.markdown
39 |
40 | assert '[II.](#page-1-0)' in markdown
41 | assert 'II. THEORETICAL FRAMEWORK' in markdown
42 |
43 |     for ref in set([f'<span id="page-{m[0]}-{m[1]}"></span>' for m in re.findall(r'\]\(#page-(\d+)-(\d+)\)', markdown)]):
44 | assert ref in markdown, f"Reference {ref} not found in markdown"
45 |
--------------------------------------------------------------------------------
/tests/builders/test_rotated_bboxes.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from marker.schema import BlockTypes
4 |
5 |
6 | @pytest.mark.config({"page_range": [0]})
7 | @pytest.mark.filename("adversarial_rot.pdf")
8 | def test_rotated_bboxes(pdf_document):
9 | first_page = pdf_document.pages[0]
10 |
11 | # Ensure we match all text lines up properly
12 | text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
13 | text_blocks = first_page.contained_blocks(
14 | pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath)
15 | )
16 | assert len(text_lines) == 85
17 |
18 | # Ensure the bbox sizes match up
19 | max_line_position = max([line.polygon.x_end for line in text_lines])
20 | max_block_position = max(
21 | [block.polygon.x_end for block in text_blocks if block.source == "layout"]
22 | )
23 | assert max_line_position <= max_block_position
24 |
--------------------------------------------------------------------------------
/tests/builders/test_strip_existing_ocr.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 |
4 | @pytest.mark.config({"page_range": [0], "strip_existing_ocr": True})
5 | @pytest.mark.filename("handwritten.pdf")
6 | def test_strip_ocr(doc_provider):
7 | # Ensure that the OCR text isn't extracted
8 | assert len(doc_provider.page_lines) == 0
9 |
10 |
11 | @pytest.mark.config({"page_range": [0]})
12 | @pytest.mark.filename("handwritten.pdf")
13 | def test_keep_ocr(doc_provider):
14 | assert len(doc_provider.page_lines) == 1
15 |
--------------------------------------------------------------------------------
/tests/builders/test_structure.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from marker.builders.structure import StructureBuilder
4 |
5 |
6 | @pytest.mark.config({"page_range": [0]})
7 | def test_structure_builder(pdf_document):
8 | structure = StructureBuilder()
9 | structure(pdf_document)
10 | assert len(pdf_document.pages[0].structure) > 0
11 |
--------------------------------------------------------------------------------
/tests/config/test_config.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from contextlib import suppress
3 | import click
4 |
5 | from marker.config.printer import CustomClickPrinter
6 | from marker.config.crawler import crawler
7 | from marker.config.parser import ConfigParser
8 |
9 |
10 | def capture_kwargs(argv):
11 | command = click.command(cls=CustomClickPrinter)
12 | captured_kwargs = {}
13 |
14 | def parse_args(**kwargs):
15 | captured_kwargs.update(kwargs)
16 | return kwargs
17 |
18 | original_argv = sys.argv
19 | sys.argv = argv
20 | try:
21 | with suppress(SystemExit):
22 | command(ConfigParser.common_options(parse_args))()
23 | finally:
24 | sys.argv = original_argv
25 |
26 | return captured_kwargs
27 |
28 |
29 | def test_config_parser():
30 | sys.argv = [
31 | "test",
32 | "--disable_multiprocessing",
33 | "--output_dir",
34 | "output_dir",
35 | "--height_tolerance",
36 | "0.5",
37 | ]
38 | kwargs = capture_kwargs(sys.argv)
39 | parser = ConfigParser(kwargs)
40 | config_dict = parser.generate_config_dict()
41 |
42 | # Validate kwarg capturing
43 | assert kwargs["disable_multiprocessing"]
44 | assert kwargs["output_dir"] == "output_dir"
45 |
46 | assert config_dict["pdftext_workers"] == 1 # disabling multiprocessing does this
47 | assert config_dict["height_tolerance"] == 0.5
48 | assert "output_dir" not in config_dict # This is not a config key
49 |
50 |
51 | def test_config_none():
52 | kwargs = capture_kwargs(["test"])
53 |
54 | for key in crawler.attr_set:
55 |         # Even options exposed as CLI flags should default to None when unset
56 | value = None
57 | assert kwargs.get(key) is value
58 |
59 |
60 | def test_config_llm():
61 | kwargs = capture_kwargs(["test", "--use_llm"])
62 | parser = ConfigParser(kwargs)
63 | config_dict = parser.generate_config_dict()
64 |
65 | # Validate kwarg capturing
66 | assert config_dict["use_llm"]
67 |
68 |
69 | def test_config_force_ocr():
70 | kwargs = capture_kwargs(["test", "--force_ocr", "--format_lines"])
71 | parser = ConfigParser(kwargs)
72 | config_dict = parser.generate_config_dict()
73 |
74 | # Validate kwarg capturing
75 | assert config_dict["force_ocr"]
76 | assert config_dict["format_lines"]
77 |
--------------------------------------------------------------------------------
/tests/converters/test_extraction_converter.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pytest
3 |
4 | from marker.converters.extraction import ExtractionConverter
5 | from marker.extractors.page import PageExtractionSchema
6 | from marker.services import BaseService
7 |
8 |
9 | class MockLLMService(BaseService):
10 | def __call__(self, prompt, image=None, page=None, response_schema=None, **kwargs):
11 | assert response_schema == PageExtractionSchema
12 | return {
13 | "description": "Mock extraction description",
14 | "extracted_json": json.dumps({"test_key": "test_value"}),
15 | "existence_confidence": 5,
16 | "value_confidence": 5,
17 | }
18 |
19 |
20 | @pytest.fixture
21 | def mock_llm_service():
22 | return MockLLMService
23 |
24 |
25 | @pytest.fixture
26 | def extraction_converter(config, model_dict, mock_llm_service):
27 | test_schema = {
28 | "title": "TestSchema",
29 | "type": "object",
30 | "properties": {"test_key": {"title": "Test Key", "type": "string"}},
31 | "required": ["test_key"],
32 | }
33 |
34 | config["page_schema"] = json.dumps(test_schema)
35 | config["output_format"] = "markdown"
36 | model_dict["llm_service"] = mock_llm_service
37 |
38 | converter = ExtractionConverter(
39 | artifact_dict=model_dict, processor_list=None, config=config
40 | )
41 | converter.default_llm_service = MockLLMService
42 | return converter
43 |
44 |
45 | @pytest.mark.config({"page_range": [0]})
46 | def test_extraction_converter_invalid_schema(
47 | config, model_dict, mock_llm_service, temp_doc
48 | ):
49 | config["page_schema"] = "invalid json"
50 |
51 | model_dict["llm_service"] = mock_llm_service
52 | converter = ExtractionConverter(
53 | artifact_dict=model_dict, processor_list=None, config=config
54 | )
55 |
56 | with pytest.raises(ValueError):
57 | converter(temp_doc.name)
58 |
59 |
60 | @pytest.mark.config({"page_range": [0, 1]})
61 | def test_extraction_converter_multiple_pages(extraction_converter, temp_doc):
62 | result = extraction_converter(temp_doc.name)
63 |
64 | assert result is not None
65 | assert result.document_json is not None
66 | assert result.document_json == {"test_key": "test_value"}
67 |
--------------------------------------------------------------------------------
/tests/converters/test_ocr_converter.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from marker.converters.ocr import OCRConverter
4 | from marker.renderers.ocr_json import OCRJSONOutput, OCRJSONPageOutput
5 |
6 |
7 | def _ocr_converter(config, model_dict, temp_pdf, line_count: int, eq_count: int):
8 | converter = OCRConverter(artifact_dict=model_dict, config=config)
9 |
10 | ocr_json: OCRJSONOutput = converter(temp_pdf.name)
11 | pages = ocr_json.children
12 |
13 | assert len(pages) == 1
14 | assert len(pages[0].children) == line_count
15 | eqs = [line for line in pages[0].children if line.block_type == "Equation"]
16 | assert len(eqs) == eq_count
17 | return pages
18 |
19 |
20 | def check_bboxes(page: OCRJSONPageOutput, lines):
21 | page_size = page.bbox
22 | for line in lines:
23 | assert len(line.children) > 0
24 | for child in line.children:
25 | bbox = child.bbox
26 | assert all(
27 | [
28 | bbox[0] >= page_size[0],
29 | bbox[1] >= page_size[1],
30 | bbox[2] <= page_size[2],
31 | bbox[3] <= page_size[3],
32 | ]
33 | ), "Child bbox is outside page bbox"
34 |
35 |
36 | @pytest.mark.config({"page_range": [0]})
37 | def test_ocr_converter(config, model_dict, temp_doc):
38 | _ocr_converter(config, model_dict, temp_doc, 84, 2)
39 |
40 |
41 | @pytest.mark.filename("pres.pdf")
42 | @pytest.mark.config({"page_range": [1], "force_ocr": True, "keep_chars": True})
43 | def test_ocr_converter_force(config, model_dict, temp_doc):
44 | pages = _ocr_converter(config, model_dict, temp_doc, 10, 0)
45 | lines = [line for line in pages[0].children if line.block_type == "Line"]
46 | check_bboxes(pages[0], lines)
47 |
48 |
49 | @pytest.mark.filename("pres.pdf")
50 | @pytest.mark.config({"page_range": [1], "keep_chars": True})
51 | def test_ocr_converter_keep(config, model_dict, temp_doc):
52 | pages = _ocr_converter(config, model_dict, temp_doc, 9, 0)
53 | lines = [line for line in pages[0].children if line.block_type == "Line"]
54 | check_bboxes(pages[0], lines)
55 |
--------------------------------------------------------------------------------
/tests/converters/test_table_converter.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from marker.converters.table import TableConverter
3 | from marker.renderers.markdown import MarkdownOutput
4 | from marker.util import classes_to_strings
5 |
6 | def _table_converter(config, model_dict, renderer, temp_pdf):
7 | converter = TableConverter(
8 | artifact_dict=model_dict,
9 | processor_list=None,
10 | renderer=classes_to_strings([renderer])[0],
11 | config=config
12 | )
13 |
14 | markdown_output: MarkdownOutput = converter(temp_pdf.name)
15 | markdown = markdown_output.markdown
16 |
17 | assert len(markdown) > 0
18 | assert "cyclic" in markdown
19 |
20 |
21 | @pytest.mark.output_format("markdown")
22 | @pytest.mark.config({"page_range": [5]})
23 | def test_table_converter(config, model_dict, renderer, temp_doc):
24 | _table_converter(config, model_dict, renderer, temp_doc)
25 |
26 | @pytest.mark.output_format("markdown")
27 | @pytest.mark.config({"page_range": [5], "force_ocr": True})
28 | def test_table_converter_ocr(config, model_dict, renderer, temp_doc):
29 | _table_converter(config, model_dict, renderer, temp_doc)
30 |
31 |
--------------------------------------------------------------------------------
/tests/processors/test_document_toc_processor.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from marker.processors.document_toc import DocumentTOCProcessor
4 |
5 |
6 | @pytest.mark.config({"page_range": [0]})
7 | def test_document_toc_processor(pdf_document, detection_model, recognition_model, table_rec_model):
8 | processor = DocumentTOCProcessor()
9 | processor(pdf_document)
10 |
11 | assert len(pdf_document.table_of_contents) == 3
12 | assert pdf_document.table_of_contents[0]["title"] == "Subspace Adversarial Training"
13 |
--------------------------------------------------------------------------------
/tests/processors/test_equation_processor.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from marker.schema import BlockTypes
4 | from marker.processors.equation import EquationProcessor
5 |
6 |
7 | @pytest.mark.config({"page_range": [0]})
8 | def test_equation_processor(pdf_document, recognition_model):
9 | processor = EquationProcessor(recognition_model)
10 | processor(pdf_document)
11 |
12 | for block in pdf_document.pages[0].children:
13 | if block.block_type == BlockTypes.Equation:
14 | assert block.html is not None
--------------------------------------------------------------------------------
/tests/processors/test_footnote_processor.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from marker.processors.footnote import FootnoteProcessor
4 | from marker.schema import BlockTypes
5 |
6 |
7 | @pytest.mark.filename("population_stats.pdf")
8 | @pytest.mark.config({"page_range": [4]})
9 | def test_footnote_processor(pdf_document):
10 | processor = FootnoteProcessor()
11 | processor(pdf_document)
12 |
13 | page0_footnotes = pdf_document.pages[0].contained_blocks(pdf_document, [BlockTypes.Footnote])
14 | assert len(page0_footnotes) >= 2
15 |
16 | assert page0_footnotes[-1].raw_text(pdf_document).strip().startswith("5")
17 |
--------------------------------------------------------------------------------
/tests/processors/test_ignoretext.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from marker.processors.ignoretext import IgnoreTextProcessor
4 | from marker.schema import BlockTypes
5 |
6 |
7 | @pytest.mark.filename("bio_pdf.pdf")
8 | @pytest.mark.config({"page_range": list(range(10))})
9 | def test_ignoretext_processor(pdf_document):
10 | processor = IgnoreTextProcessor()
11 | processor(pdf_document)
12 |
13 | page1_header = pdf_document.pages[1].contained_blocks(pdf_document, [BlockTypes.Text])[0]
14 | assert "bioRxiv" in page1_header.raw_text(pdf_document)
15 |
16 | assert page1_header.ignore_for_output is True
17 |
--------------------------------------------------------------------------------
/tests/processors/test_table_merge.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import Mock
2 |
3 | import pytest
4 |
5 | from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor
6 | from marker.processors.table import TableProcessor
7 | from marker.schema import BlockTypes
8 |
9 |
10 | @pytest.mark.filename("table_ex2.pdf")
11 | def test_llm_table_processor_nomerge(pdf_document, detection_model, table_rec_model, recognition_model, mocker):
12 | mock_cls = Mock()
13 | mock_cls.return_value = {
14 | "merge": "true",
15 | "direction": "right"
16 | }
17 |
18 | cell_processor = TableProcessor(detection_model, recognition_model, table_rec_model)
19 | cell_processor(pdf_document)
20 |
21 | tables = pdf_document.contained_blocks((BlockTypes.Table,))
22 | assert len(tables) == 3
23 |
24 | processor = LLMTableMergeProcessor(mock_cls, {"use_llm": True, "gemini_api_key": "test"})
25 | processor(pdf_document)
26 |
27 | tables = pdf_document.contained_blocks((BlockTypes.Table,))
28 | assert len(tables) == 3
--------------------------------------------------------------------------------
/tests/processors/test_table_processor.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import pytest
4 | from marker.renderers.json import JSONRenderer
5 |
6 | from marker.renderers.markdown import MarkdownRenderer
7 | from marker.schema import BlockTypes
8 | from marker.processors.table import TableProcessor
9 | from marker.schema.blocks import TableCell
10 |
11 |
12 | @pytest.mark.config({"page_range": [5]})
13 | def test_table_processor(pdf_document, detection_model, recognition_model, table_rec_model):
14 | processor = TableProcessor(detection_model, recognition_model, table_rec_model)
15 | processor(pdf_document)
16 |
17 | for block in pdf_document.pages[0].children:
18 | if block.block_type == BlockTypes.Table:
19 | children = block.contained_blocks(pdf_document, (BlockTypes.TableCell,))
20 | assert children
21 | assert len(children) > 0
22 | assert isinstance(children[0], TableCell)
23 |
24 | assert len(pdf_document.contained_blocks((BlockTypes.Table,))) == 2
25 |
26 | renderer = MarkdownRenderer()
27 | table_output = renderer(pdf_document)
28 | assert "Schedule" in table_output.markdown
29 |
30 |
31 | @pytest.mark.filename("table_ex.pdf")
32 | @pytest.mark.config({"page_range": [0], "force_ocr": True})
33 | def test_avoid_double_ocr(pdf_document, detection_model, recognition_model, table_rec_model):
34 | tables = pdf_document.contained_blocks((BlockTypes.Table,))
35 | lines = tables[0].contained_blocks(pdf_document, (BlockTypes.Line,))
36 | assert len(lines) == 0
37 |
38 | processor = TableProcessor(detection_model, recognition_model, table_rec_model, config={"force_ocr": True})
39 | processor(pdf_document)
40 |
41 | renderer = MarkdownRenderer()
42 | table_output = renderer(pdf_document)
43 | assert "Participants" in table_output.markdown
44 |
45 |
46 | @pytest.mark.filename("multicol-blocks.pdf")
47 | @pytest.mark.config({"page_range": [3]})
48 | def test_overlap_blocks(pdf_document, detection_model, recognition_model, table_rec_model):
49 | page = pdf_document.pages[0]
50 | assert "Cascading, and the Auxiliary Problem Principle" in page.raw_text(pdf_document)
51 |
52 | processor = TableProcessor(detection_model, recognition_model, table_rec_model)
53 | processor(pdf_document)
54 |
55 | assert "Cascading, and the Auxiliary Problem Principle" in page.raw_text(pdf_document)
56 |
57 |
58 | @pytest.mark.filename("pres.pdf")
59 | @pytest.mark.config({"page_range": [4]})
60 | def test_ocr_table(pdf_document, detection_model, recognition_model, table_rec_model):
61 | processor = TableProcessor(detection_model, recognition_model, table_rec_model)
62 | processor(pdf_document)
63 |
64 | renderer = MarkdownRenderer()
65 | table_output = renderer(pdf_document)
66 | assert "1.2E-38" in table_output.markdown
67 |
68 |
69 | @pytest.mark.config({"page_range": [11]})
70 | def test_split_rows(pdf_document, detection_model, recognition_model, table_rec_model):
71 | processor = TableProcessor(detection_model, recognition_model, table_rec_model)
72 | processor(pdf_document)
73 |
74 | table = pdf_document.contained_blocks((BlockTypes.Table,))[-1]
75 | cells: List[TableCell] = table.contained_blocks(pdf_document, (BlockTypes.TableCell,))
76 | unique_rows = len(set([cell.row_id for cell in cells]))
77 | assert unique_rows == 6
78 |
79 |
80 |
--------------------------------------------------------------------------------
/tests/providers/test_document_providers.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 |
4 | @pytest.mark.config({"page_range": [0]})
5 | @pytest.mark.filename("lambda.pptx")
6 | def test_pptx_provider(doc_provider):
7 | assert doc_provider.get_images([0], 72)[0].size == (842, 596)
8 |
9 | page_lines = doc_provider.get_page_lines(0)
10 |
11 | spans = page_lines[0].spans
12 | assert spans[0].text == "Lambda Calculus"
13 |
14 | spans = page_lines[1].spans
15 | assert spans[0].text == "CSE 340 – Principles of Programming Languages"
16 |
17 |
18 | @pytest.mark.config({"page_range": [0]})
19 | @pytest.mark.filename("manual.epub")
20 | def test_epub_provider(doc_provider):
21 | assert doc_provider.get_images([0], 72)[0].size == (596, 842)
22 |
23 | page_lines = doc_provider.get_page_lines(0)
24 |
25 | spans = page_lines[0].spans
26 | assert spans[0].text == "The Project Gutenberg eBook of Simple"
27 |
28 |
29 | @pytest.mark.config({"page_range": [0]})
30 | @pytest.mark.filename("china.html")
31 | def test_html_provider(doc_provider):
32 | assert doc_provider.get_images([0], 72)[0].size == (596, 842)
33 |
34 | page_lines = doc_provider.get_page_lines(0)
35 |
36 | spans = page_lines[0].spans
37 | assert spans[0].text == "Jump to content"
38 |
39 | @pytest.mark.config({"page_range": [0]})
40 | @pytest.mark.filename("gatsby.docx")
41 | def test_docx_provider(doc_provider):
42 | assert doc_provider.get_images([0], 72)[0].size == (596, 842)
43 |
44 | page_lines = doc_provider.get_page_lines(0)
45 |
46 | spans = page_lines[0].spans
47 | assert spans[0].text == "Themes"
48 |
49 |
50 | @pytest.mark.config({"page_range": [0]})
51 | @pytest.mark.filename("single_sheet.xlsx")
52 | def test_xlsx_provider(doc_provider):
53 | assert doc_provider.get_images([0], 72)[0].size == (842, 596)
54 |
55 | page_lines = doc_provider.get_page_lines(0)
56 |
57 | spans = page_lines[0].spans
58 | assert spans[0].text == "Sheet1"
--------------------------------------------------------------------------------
/tests/providers/test_image_provider.py:
--------------------------------------------------------------------------------
1 | from marker.providers.image import ImageProvider
2 | from marker.renderers.markdown import MarkdownOutput
3 |
4 |
5 | def test_image_provider(config, temp_image):
6 | provider = ImageProvider(temp_image.name, config)
7 | assert len(provider) == 1
8 | assert provider.get_images([0], 72)[0].size == (512, 512)
9 |
10 | page_lines = provider.get_page_lines(0)
11 | assert len(page_lines) == 0
12 |
13 | def test_image_provider_conversion(pdf_converter, temp_image):
14 | markdown_output: MarkdownOutput = pdf_converter(temp_image.name)
15 | assert "Hello, World!" in markdown_output.markdown
16 |
17 |
18 |
--------------------------------------------------------------------------------
/tests/providers/test_pdf_provider.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 |
4 | @pytest.mark.config({"page_range": [0]})
5 | def test_pdf_provider(doc_provider):
6 | assert len(doc_provider) == 12
7 | assert doc_provider.get_images([0], 72)[0].size == (612, 792)
8 | assert doc_provider.get_images([0], 96)[0].size == (816, 1056)
9 |
10 | page_lines = doc_provider.get_page_lines(0)
11 | assert len(page_lines) == 87
12 |
13 | spans = page_lines[0].spans
14 | assert len(spans) == 2
15 | assert spans[0].text == "Subspace Adversarial Training"
16 | assert spans[0].font == "NimbusRomNo9L-Medi"
17 | assert spans[0].formats == ["plain"]
18 |
--------------------------------------------------------------------------------
/tests/renderers/test_extract_images.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from marker.renderers.markdown import MarkdownRenderer
4 |
5 |
6 | @pytest.mark.config({"page_range": [0]})
7 | @pytest.mark.filename("A17_FlightPlan.pdf")
8 | def test_disable_extract_images(pdf_document):
9 | renderer = MarkdownRenderer({"extract_images": False})
10 | md = renderer(pdf_document).markdown
11 |
12 | # Verify markdown
13 | assert len(md) == 0
14 |
15 |
16 | @pytest.mark.config({"page_range": [0]})
17 | @pytest.mark.filename("A17_FlightPlan.pdf")
18 | def test_extract_images(pdf_document):
19 | renderer = MarkdownRenderer()
20 | md = renderer(pdf_document).markdown
21 |
22 | # Verify markdown
23 | assert "jpeg" in md
--------------------------------------------------------------------------------
/tests/renderers/test_json_renderer.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from marker.renderers.json import JSONRenderer
4 |
5 |
6 | @pytest.mark.config({"page_range": [0]})
7 | def test_markdown_renderer_pagination(pdf_document):
8 | renderer = JSONRenderer()
9 | pages = renderer(pdf_document).children
10 |
11 | assert len(pages) == 1
12 | assert pages[0].block_type == "Page"
13 | assert pages[0].children[0].block_type == "SectionHeader"
--------------------------------------------------------------------------------
/tests/renderers/test_markdown_renderer.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from marker.renderers.markdown import MarkdownRenderer
4 | from marker.schema import BlockTypes
5 | from marker.schema.blocks import TableCell
6 |
7 |
8 | @pytest.mark.config({"page_range": [0]})
9 | def test_markdown_renderer(pdf_document):
10 | renderer = MarkdownRenderer()
11 | md = renderer(pdf_document).markdown
12 |
13 | # Verify markdown
14 | assert '# Subspace Adversarial Training' in md
15 |
16 |
17 | @pytest.mark.config({"page_range": [0, 1], "paginate_output": True})
18 | def test_markdown_renderer_pagination(pdf_document):
19 | renderer = MarkdownRenderer({"paginate_output": True})
20 | md = renderer(pdf_document).markdown
21 |
22 | assert "{0}-" in md
23 | assert "{1}-" in md
24 |
25 |
26 | @pytest.mark.config({"page_range": [0, 1]})
27 | def test_markdown_renderer_metadata(pdf_document):
28 | renderer = MarkdownRenderer({"paginate_output": True})
29 | metadata = renderer(pdf_document).metadata
30 | assert "table_of_contents" in metadata
31 |
32 |
33 | @pytest.mark.config({"page_range": [0, 1]})
34 | def test_markdown_renderer_images(pdf_document):
35 | renderer = MarkdownRenderer({"extract_images": False})
36 | markdown_output = renderer(pdf_document)
37 |
38 | assert len(markdown_output.images) == 0
39 |     assert '<img' not in markdown_output.markdown
40 | 
41 | @pytest.mark.config({"page_range": [5]})
42 | def test_markdown_renderer_tables(pdf_document):
43 | table = pdf_document.contained_blocks((BlockTypes.Table,))[0]
44 | page = pdf_document.pages[0]
45 |
46 | cell = TableCell(
47 | polygon=table.polygon,
48 |         text_lines=["54<i>.45</i>67 89<math>x</math>"],
49 | rowspan=1,
50 | colspan=1,
51 | row_id=0,
52 | col_id=0,
53 | is_header=False,
54 | page_id=page.page_id,
55 | )
56 | page.add_full_block(cell)
57 | table.structure = []
58 | table.add_structure(cell)
59 |
60 | renderer = MarkdownRenderer()
61 | md = renderer(pdf_document).markdown
62 |     assert "54 <i>.45</i> 67 89 $x$" in md
63 |
64 |
65 |
--------------------------------------------------------------------------------
/tests/schema/groups/test_list_grouping.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from marker.builders.structure import StructureBuilder
4 | from marker.schema import BlockTypes
5 |
6 |
7 | @pytest.mark.config({"page_range": [4]})
8 | def test_list_grouping(pdf_document):
9 | structure = StructureBuilder()
10 | structure(pdf_document)
11 |
12 | page = pdf_document.pages[0]
13 | list_groups = []
14 | for block in page.children:
15 | if block.block_type == BlockTypes.ListGroup:
16 | list_groups.append(block)
17 |
18 | # The model breaks this up, since it has equations in it
19 | assert len(list_groups) == 3
20 |
--------------------------------------------------------------------------------
/tests/services/test_service_init.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from marker.converters.pdf import PdfConverter
4 | from marker.services.gemini import GoogleGeminiService
5 | from marker.services.ollama import OllamaService
6 | from marker.services.vertex import GoogleVertexService
7 | from marker.services.openai import OpenAIService
8 |
9 |
10 | @pytest.mark.output_format("markdown")
11 | @pytest.mark.config({"page_range": [0]})
12 | def test_empty_llm(pdf_converter: PdfConverter, temp_doc):
13 | assert pdf_converter.artifact_dict["llm_service"] is None
14 | assert pdf_converter.llm_service is None
15 |
16 |
17 | def test_llm_no_keys(model_dict, config):
18 | with pytest.raises(AssertionError):
19 | PdfConverter(
20 | artifact_dict=model_dict,
21 | config={"use_llm": True}
22 | )
23 |
24 | @pytest.mark.output_format("markdown")
25 | @pytest.mark.config({"page_range": [0], "use_llm": True, "gemini_api_key": "test"})
26 | def test_llm_gemini(pdf_converter: PdfConverter, temp_doc):
27 | assert pdf_converter.artifact_dict["llm_service"] is not None
28 | assert isinstance(pdf_converter.llm_service, GoogleGeminiService)
29 |
30 |
31 | @pytest.mark.output_format("markdown")
32 | @pytest.mark.config({"page_range": [0], "use_llm": True, "vertex_project_id": "test", "llm_service": "marker.services.vertex.GoogleVertexService"})
33 | def test_llm_vertex(pdf_converter: PdfConverter, temp_doc):
34 | assert pdf_converter.artifact_dict["llm_service"] is not None
35 | assert isinstance(pdf_converter.llm_service, GoogleVertexService)
36 |
37 |
38 | @pytest.mark.output_format("markdown")
39 | @pytest.mark.config({"page_range": [0], "use_llm": True, "llm_service": "marker.services.ollama.OllamaService"})
40 | def test_llm_ollama(pdf_converter: PdfConverter, temp_doc):
41 | assert pdf_converter.artifact_dict["llm_service"] is not None
42 | assert isinstance(pdf_converter.llm_service, OllamaService)
43 |
44 | @pytest.mark.output_format("markdown")
45 | @pytest.mark.config({"page_range": [0], "use_llm": True, "llm_service": "marker.services.openai.OpenAIService", "openai_api_key": "test"})
46 | def test_llm_openai(pdf_converter: PdfConverter, temp_doc):
47 | assert pdf_converter.artifact_dict["llm_service"] is not None
48 | assert isinstance(pdf_converter.llm_service, OpenAIService)
--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------
1 | from marker.providers.pdf import PdfProvider
2 | import tempfile
3 |
4 | import datasets
5 |
6 |
7 | def setup_pdf_provider(
8 | filename='adversarial.pdf',
9 | config=None,
10 | ) -> PdfProvider:
11 | dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
12 | idx = dataset['filename'].index(filename)
13 |
14 | temp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf")
15 | temp_pdf.write(dataset['pdf'][idx])
16 | temp_pdf.flush()
17 |
18 | provider = PdfProvider(temp_pdf.name, config)
19 | return provider
20 |
--------------------------------------------------------------------------------
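The setup_pdf_provider helper above also works outside pytest for quick experiments; the config keys mirror what the tests pass. A short sketch:

from tests.utils import setup_pdf_provider

provider = setup_pdf_provider("adversarial.pdf", {"page_range": [0]})
page_lines = provider.get_page_lines(0)
print(len(page_lines))  # 87 for page 0 of adversarial.pdf, per test_pdf_provider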