├── .github └── workflows │ ├── benchmarks.yml │ ├── ci.yml │ ├── cla.yml │ ├── publish.yml │ └── scripts.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CLA.md ├── LICENSE ├── README.md ├── benchmarks ├── __init__.py ├── overall │ ├── __init__.py │ ├── display │ │ ├── __init__.py │ │ ├── dataset.py │ │ └── table.py │ ├── download │ │ ├── __init__.py │ │ ├── base.py │ │ ├── llamaparse.py │ │ ├── main.py │ │ ├── mathpix.py │ │ └── mistral.py │ ├── elo.py │ ├── methods │ │ ├── __init__.py │ │ ├── docling.py │ │ ├── gt.py │ │ ├── llamaparse.py │ │ ├── marker.py │ │ ├── mathpix.py │ │ ├── mistral.py │ │ ├── olmocr.py │ │ └── schema.py │ ├── overall.py │ ├── registry.py │ ├── schema.py │ └── scorers │ │ ├── __init__.py │ │ ├── clean.py │ │ ├── heuristic.py │ │ ├── llm.py │ │ └── schema.py ├── table │ ├── __init__.py │ ├── gemini.py │ ├── inference.py │ ├── scoring.py │ └── table.py ├── throughput │ ├── __init__.py │ └── main.py └── verify_scores.py ├── chunk_convert.py ├── convert.py ├── convert_single.py ├── data ├── .gitignore ├── examples │ ├── json │ │ ├── multicolcnn.json │ │ ├── switch_trans.json │ │ └── thinkpython.json │ └── markdown │ │ ├── multicolcnn │ │ ├── _page_1_Figure_0.jpeg │ │ ├── _page_2_Picture_0.jpeg │ │ ├── _page_6_Figure_0.jpeg │ │ ├── _page_7_Figure_0.jpeg │ │ ├── multicolcnn.md │ │ └── multicolcnn_meta.json │ │ ├── switch_transformers │ │ ├── _page_11_Figure_4.jpeg │ │ ├── _page_12_Figure_4.jpeg │ │ ├── _page_13_Figure_2.jpeg │ │ ├── _page_18_Figure_1.jpeg │ │ ├── _page_18_Figure_3.jpeg │ │ ├── _page_20_Figure_1.jpeg │ │ ├── _page_20_Figure_4.jpeg │ │ ├── _page_27_Figure_1.jpeg │ │ ├── _page_29_Figure_1.jpeg │ │ ├── _page_2_Figure_3.jpeg │ │ ├── _page_30_Figure_1.jpeg │ │ ├── _page_31_Figure_3.jpeg │ │ ├── _page_4_Figure_1.jpeg │ │ ├── _page_5_Figure_3.jpeg │ │ ├── switch_trans.md │ │ └── switch_trans_meta.json │ │ └── thinkpython │ │ ├── _page_109_Figure_1.jpeg │ │ ├── _page_115_Figure_1.jpeg │ │ ├── _page_116_Figure_3.jpeg │ │ ├── _page_127_Figure_1.jpeg │ │ ├── _page_128_Figure_1.jpeg │ │ ├── _page_167_Figure_1.jpeg │ │ ├── _page_169_Figure_1.jpeg │ │ ├── _page_173_Figure_1.jpeg │ │ ├── _page_190_Figure_1.jpeg │ │ ├── _page_195_Figure_1.jpeg │ │ ├── _page_205_Figure_1.jpeg │ │ ├── _page_230_Figure_1.jpeg │ │ ├── _page_233_Figure_1.jpeg │ │ ├── _page_233_Figure_3.jpeg │ │ ├── _page_234_Figure_1.jpeg │ │ ├── _page_235_Figure_1.jpeg │ │ ├── _page_236_Figure_1.jpeg │ │ ├── _page_236_Figure_3.jpeg │ │ ├── _page_237_Figure_1.jpeg │ │ ├── _page_238_Figure_1.jpeg │ │ ├── _page_23_Figure_1.jpeg │ │ ├── _page_23_Figure_3.jpeg │ │ ├── _page_46_Figure_1.jpeg │ │ ├── _page_60_Figure_1.jpeg │ │ ├── _page_60_Figure_3.jpeg │ │ ├── _page_67_Figure_1.jpeg │ │ ├── _page_71_Figure_1.jpeg │ │ ├── _page_78_Figure_1.jpeg │ │ ├── _page_85_Figure_1.jpeg │ │ ├── _page_94_Figure_1.jpeg │ │ ├── _page_99_Figure_17.jpeg │ │ ├── _page_99_Figure_178.jpeg │ │ ├── thinkpython.md │ │ └── thinkpython_meta.json ├── images │ ├── overall.png │ ├── per_doc.png │ └── table.png └── latex_to_md.sh ├── extraction_app.py ├── marker ├── builders │ ├── __init__.py │ ├── document.py │ ├── layout.py │ ├── line.py │ ├── llm_layout.py │ ├── ocr.py │ └── structure.py ├── config │ ├── __init__.py │ ├── crawler.py │ ├── parser.py │ └── printer.py ├── converters │ ├── __init__.py │ ├── extraction.py │ ├── ocr.py │ ├── pdf.py │ └── table.py ├── extractors │ ├── __init__.py │ └── page.py ├── logger.py ├── models.py ├── output.py ├── processors │ ├── __init__.py │ ├── blockquote.py │ ├── code.py │ ├── debug.py │ ├── 
document_toc.py │ ├── equation.py │ ├── footnote.py │ ├── ignoretext.py │ ├── line_merge.py │ ├── line_numbers.py │ ├── list.py │ ├── llm │ │ ├── __init__.py │ │ ├── llm_complex.py │ │ ├── llm_equation.py │ │ ├── llm_form.py │ │ ├── llm_handwriting.py │ │ ├── llm_image_description.py │ │ ├── llm_mathblock.py │ │ ├── llm_meta.py │ │ ├── llm_table.py │ │ └── llm_table_merge.py │ ├── order.py │ ├── page_header.py │ ├── reference.py │ ├── sectionheader.py │ ├── table.py │ ├── text.py │ └── util.py ├── providers │ ├── __init__.py │ ├── document.py │ ├── epub.py │ ├── html.py │ ├── image.py │ ├── pdf.py │ ├── powerpoint.py │ ├── registry.py │ ├── spreadsheet.py │ └── utils.py ├── renderers │ ├── __init__.py │ ├── extraction.py │ ├── html.py │ ├── json.py │ ├── markdown.py │ └── ocr_json.py ├── schema │ ├── __init__.py │ ├── blocks │ │ ├── __init__.py │ │ ├── base.py │ │ ├── basetable.py │ │ ├── caption.py │ │ ├── code.py │ │ ├── complexregion.py │ │ ├── equation.py │ │ ├── figure.py │ │ ├── footnote.py │ │ ├── form.py │ │ ├── handwriting.py │ │ ├── inlinemath.py │ │ ├── listitem.py │ │ ├── pagefooter.py │ │ ├── pageheader.py │ │ ├── picture.py │ │ ├── reference.py │ │ ├── sectionheader.py │ │ ├── table.py │ │ ├── tablecell.py │ │ ├── text.py │ │ └── toc.py │ ├── document.py │ ├── groups │ │ ├── __init__.py │ │ ├── base.py │ │ ├── figure.py │ │ ├── list.py │ │ ├── page.py │ │ ├── picture.py │ │ └── table.py │ ├── polygon.py │ ├── registry.py │ └── text │ │ ├── __init__.py │ │ ├── char.py │ │ ├── line.py │ │ └── span.py ├── scripts │ ├── __init__.py │ ├── chunk_convert.py │ ├── chunk_convert.sh │ ├── common.py │ ├── convert.py │ ├── convert_single.py │ ├── extraction_app.py │ ├── file_to_s3.py │ ├── run_streamlit_app.py │ ├── server.py │ └── streamlit_app.py ├── services │ ├── __init__.py │ ├── claude.py │ ├── gemini.py │ ├── ollama.py │ ├── openai.py │ └── vertex.py ├── settings.py └── util.py ├── marker_app.py ├── marker_server.py ├── poetry.lock ├── pyproject.toml ├── pytest.ini ├── signatures └── version1 │ └── cla.json ├── static └── fonts │ └── .gitignore └── tests ├── builders ├── test_blank_page.py ├── test_document_builder.py ├── test_garbled_pdf.py ├── test_layout_replace.py ├── test_line_builder.py ├── test_merged_lines.py ├── test_ocr_builder.py ├── test_ocr_pipeline.py ├── test_overriding.py ├── test_pdf_links.py ├── test_rotated_bboxes.py ├── test_strip_existing_ocr.py └── test_structure.py ├── config └── test_config.py ├── conftest.py ├── converters ├── test_extraction_converter.py ├── test_ocr_converter.py ├── test_pdf_converter.py └── test_table_converter.py ├── processors ├── test_document_toc_processor.py ├── test_equation_processor.py ├── test_footnote_processor.py ├── test_ignoretext.py ├── test_llm_processors.py ├── test_table_merge.py └── test_table_processor.py ├── providers ├── test_document_providers.py ├── test_image_provider.py └── test_pdf_provider.py ├── renderers ├── test_extract_images.py ├── test_json_renderer.py └── test_markdown_renderer.py ├── schema └── groups │ └── test_list_grouping.py ├── services └── test_service_init.py └── utils.py /.github/workflows/benchmarks.yml: -------------------------------------------------------------------------------- 1 | name: Integration test 2 | 3 | on: [push] 4 | 5 | env: 6 | PYTHONIOENCODING: "utf-8" 7 | 8 | jobs: 9 | benchmark: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Set up Python 3.11 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: 3.11 17 | - name: Install 
apt dependencies 18 | run: | 19 | sudo apt-get update 20 | sudo apt-get install -y pandoc 21 | - name: Install python dependencies 22 | run: | 23 | pip install poetry 24 | poetry install --extras "full" 25 | - name: Run benchmark test 26 | run: | 27 | poetry run python benchmarks/overall/overall.py --max_rows 5 28 | poetry run python benchmarks/verify_scores.py conversion_results/benchmark/overall/result.json --type marker 29 | - name: Run table benchmark 30 | run: | 31 | poetry run python benchmarks/table/table.py --max_rows 5 32 | poetry run python benchmarks/verify_scores.py conversion_results/benchmark/table/table.json --type table -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI tests 2 | 3 | on: [push] 4 | 5 | jobs: 6 | tests: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | - name: Set up Python 3.11 11 | uses: actions/setup-python@v4 12 | with: 13 | python-version: 3.11 14 | - name: Install python dependencies 15 | run: | 16 | pip install poetry 17 | poetry install --extras "full" 18 | - name: Run tests 19 | env: 20 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 21 | run: poetry run pytest 22 | -------------------------------------------------------------------------------- /.github/workflows/cla.yml: -------------------------------------------------------------------------------- 1 | name: "Marker CLA Assistant" 2 | on: 3 | issue_comment: 4 | types: [created] 5 | pull_request_target: 6 | types: [opened,closed,synchronize] 7 | 8 | # explicitly configure permissions, in case your GITHUB_TOKEN workflow permissions are set to read-only in repository settings 9 | permissions: 10 | actions: write 11 | contents: write 12 | pull-requests: write 13 | statuses: write 14 | 15 | jobs: 16 | CLAAssistant: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: "Marker CLA Assistant" 20 | if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') || github.event_name == 'pull_request_target' 21 | uses: contributor-assistant/github-action@v2.3.0 22 | env: 23 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 24 | # the below token should have repo scope and must be manually added by you in the repository's secret 25 | # This token is required only if you have configured to store the signatures in a remote repository/organization 26 | PERSONAL_ACCESS_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 27 | with: 28 | path-to-signatures: 'signatures/version1/cla.json' 29 | path-to-document: 'https://github.com/VikParuchuri/marker/blob/master/CLA.md' 30 | # branch should not be protected 31 | branch: 'master' 32 | allowlist: VikParuchuri -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | on: 3 | push: 4 | tags: 5 | - "v*.*.*" 6 | jobs: 7 | build: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v3 11 | - name: Set up Python 3.11 12 | uses: actions/setup-python@v4 13 | with: 14 | python-version: 3.11 15 | - name: Install python dependencies 16 | run: | 17 | pip install poetry 18 | poetry install --extras "full" 19 | - name: Build package 20 | run: | 21 | poetry build 22 | - name: Publish package 23 | env: 24 | PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} 25 | run: | 26 | poetry config 
pypi-token.pypi "$PYPI_TOKEN" 27 | poetry publish 28 | -------------------------------------------------------------------------------- /.github/workflows/scripts.yml: -------------------------------------------------------------------------------- 1 | name: Test CLI scripts 2 | 3 | on: [push] 4 | 5 | jobs: 6 | tests: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | - name: Set up Python 3.11 11 | uses: actions/setup-python@v4 12 | with: 13 | python-version: 3.11 14 | - name: Install python dependencies 15 | run: | 16 | pip install poetry 17 | poetry install --extras "full" 18 | - name: Download benchmark data 19 | run: | 20 | wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi" 21 | unzip -o benchmark_data.zip 22 | - name: Test single script 23 | run: poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0 24 | - name: Test convert script 25 | run: poetry run marker benchmark_data/pdfs --max_files 1 --workers 1 --page_range 0 26 | - name: Test convert script multiple workers 27 | run: poetry run marker benchmark_data/pdfs --max_files 2 --workers 2 --page_range 0-5 28 | - name: Test llm option 29 | run: | 30 | poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0 --use_llm > output.txt || echo "Command failed but continuing" 31 | if ! grep -q "UserWarning" output.txt; then 32 | echo "Success: No UserWarning found" 33 | exit 0 34 | else 35 | echo "Error: UserWarning found in output" 36 | exit 1 37 | fi -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | # Ruff version. 4 | rev: v0.9.10 5 | hooks: 6 | # Run the linter. 7 | - id: ruff 8 | types_or: [ python, pyi ] 9 | args: [ --fix ] 10 | # Run the formatter.
11 | - id: ruff-format 12 | types_or: [ python, pyi ] -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/benchmarks/__init__.py -------------------------------------------------------------------------------- /benchmarks/overall/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/benchmarks/overall/__init__.py -------------------------------------------------------------------------------- /benchmarks/overall/display/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/benchmarks/overall/display/__init__.py -------------------------------------------------------------------------------- /benchmarks/overall/display/dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List 3 | import PIL.Image  # needed for the blank-image fallback in build_dataset 4 | import datasets 5 | from tqdm import tqdm 6 | 7 | from benchmarks.overall.registry import METHOD_REGISTRY 8 | from benchmarks.overall.schema import FullResult 9 | 10 | 11 | def build_dataset(bench_dataset: datasets.Dataset, result: FullResult, score_types: List[str], max_rows: int | None = None) -> datasets.Dataset: 12 | rows = [] 13 | for idx, sample in tqdm(enumerate(bench_dataset), desc="Building dataset"): 14 | if idx not in result["markdown"]: 15 | continue 16 | 17 | if max_rows is not None and idx >= max_rows: 18 | break 19 | 20 | row = { 21 | "uuid": sample["uuid"], 22 | "classification": sample["classification"], 23 | "language": sample["language"], 24 | "img": sample["img"], 25 | } 26 | for method in result["markdown"][idx]: 27 | if method == "gt": 28 | continue 29 | 30 | method_cls = METHOD_REGISTRY[method]() 31 | md = result["markdown"][idx][method] 32 | try: 33 | method_img = method_cls.render(result["markdown"][idx][method]) 34 | except Exception: 35 | # This can happen when the markdown is None 36 | method_img = PIL.Image.new("RGB", (200, 200)) 37 | 38 | row[f"{method}_md"] = md 39 | row[f"{method}_img"] = method_img 40 | 41 | for score_type in score_types: 42 | try: 43 | row[f"{method}_{score_type}"] = result["scores"][idx][method][score_type]["score"] 44 | except KeyError: 45 | row[f"{method}_{score_type}"] = -1.0 # Missing score 46 | try: 47 | row[f"{method}_{score_type}_detail"] = json.dumps(result["scores"][idx][method][score_type]["specific_scores"]) 48 | except KeyError: 49 | row[f"{method}_{score_type}_detail"] = "" # Missing detail 50 | rows.append(row) 51 | ds = datasets.Dataset.from_list(rows) 52 | return ds 53 | 54 | -------------------------------------------------------------------------------- /benchmarks/overall/display/table.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Dict, List 3 | 4 | import tabulate 5 | 6 | from benchmarks.overall.schema import FullResult 7 | 8 | def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str): 9 | table = tabulate.tabulate(rows, headers=headers, tablefmt="github") 10 | with open(out_path / filename, "w", encoding="utf-8") as f: 11 | f.write(f"# {title}\n") 12
| f.write(table) 13 | print(title) 14 | print(table) 15 | 16 | 17 | def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"): 18 | document_types = list(result["averages_by_type"][default_method][default_score_type].keys()) 19 | headers = ["Document Type"] 20 | for method in methods: 21 | for score_type in score_types: 22 | headers.append(f"{method} {score_type}") 23 | 24 | document_rows = [[k] for k in document_types] 25 | for i, doc_type in enumerate(document_types): 26 | for method in methods: 27 | for score_type in score_types: 28 | avg_score = sum(result["averages_by_type"][method][score_type][doc_type]) / max(1, len(result["averages_by_type"][method][score_type][doc_type])) 29 | document_rows[i].append(avg_score) 30 | 31 | write_table("Document Types", document_rows, headers, out_path, "document_types.md") 32 | 33 | headers = ["Block Type"] 34 | block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys()) # all possible blocks 35 | block_score_types = list(result["averages_by_block_type"][default_method].keys()) 36 | for method in methods: 37 | for score_type in block_score_types: 38 | headers.append(f"{method} {score_type}") 39 | 40 | block_rows = [[k] for k in block_types] 41 | for i, block_type in enumerate(block_types): 42 | for method in methods: 43 | for score_type in block_score_types: 44 | avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type])) 45 | block_rows[i].append(avg_score) 46 | 47 | write_table("Block types", block_rows, headers, out_path, "block_types.md") 48 | 49 | headers = ["Method", "Avg Time"] + score_types 50 | inference_rows = [[k] for k in methods] 51 | all_raw_scores = [result["scores"][i] for i in result["scores"]] 52 | for i, method in enumerate(methods): 53 | avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method])) 54 | inference_rows[i].append(avg_time) 55 | for score_type in score_types: 56 | scores_lst = [] 57 | for ar in all_raw_scores: 58 | try: 59 | # Sometimes a few llm scores are missing 60 | scores_lst.append(ar[method][score_type]["score"]) 61 | except KeyError: 62 | continue 63 | avg_score = sum(scores_lst) / max(1, len(scores_lst)) 64 | inference_rows[i].append(avg_score) 65 | 66 | write_table("Overall Results", inference_rows, headers, out_path, "overall.md") 67 | 68 | print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. 
The scores are 0-100 based on edit distance.") -------------------------------------------------------------------------------- /benchmarks/overall/download/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/benchmarks/overall/download/__init__.py -------------------------------------------------------------------------------- /benchmarks/overall/download/base.py: -------------------------------------------------------------------------------- 1 | import json 2 | from json import JSONDecodeError 3 | from pathlib import Path 4 | 5 | import datasets 6 | from tqdm import tqdm 7 | 8 | 9 | class Downloader: 10 | cache_path: Path = Path("cache") 11 | service: str 12 | 13 | def __init__(self, api_key, app_id, max_rows: int = 2200): 14 | self.cache_path.mkdir(exist_ok=True) 15 | self.max_rows = max_rows 16 | self.api_key = api_key 17 | self.app_id = app_id 18 | self.ds = datasets.load_dataset("datalab-to/marker_benchmark", split="train") 19 | 20 | def get_html(self, pdf_bytes): 21 | raise NotImplementedError 22 | 23 | def upload_ds(self): 24 | rows = [] 25 | for file in self.cache_path.glob("*.json"): 26 | with open(file, "r") as f: 27 | data = json.load(f) 28 | rows.append(data) 29 | 30 | out_ds = datasets.Dataset.from_list(rows, features=datasets.Features({ 31 | "md": datasets.Value("string"), 32 | "uuid": datasets.Value("string"), 33 | "time": datasets.Value("float"), 34 | })) 35 | out_ds.push_to_hub(f"datalab-to/marker_benchmark_{self.service}", private=True) 36 | 37 | def generate_data(self): 38 | max_rows = self.max_rows 39 | for idx, sample in tqdm(enumerate(self.ds), desc=f"Saving {self.service} results"): 40 | cache_file = self.cache_path / f"{idx}.json" 41 | if cache_file.exists(): 42 | continue 43 | 44 | pdf_bytes = sample["pdf"] # This is a single page PDF 45 | try: 46 | out_data = self.get_html(pdf_bytes) 47 | except JSONDecodeError as e: 48 | print(f"Error with sample {idx}: {e}") 49 | continue 50 | except Exception as e: 51 | print(f"Error with sample {idx}: {e}") 52 | continue 53 | out_data["uuid"] = sample["uuid"] 54 | 55 | with cache_file.open("w") as f: 56 | json.dump(out_data, f) 57 | 58 | if idx >= max_rows: 59 | break 60 | 61 | def __call__(self): 62 | self.generate_data() 63 | self.upload_ds() 64 | -------------------------------------------------------------------------------- /benchmarks/overall/download/llamaparse.py: -------------------------------------------------------------------------------- 1 | import io 2 | import time 3 | 4 | import requests 5 | 6 | from benchmarks.overall.download.base import Downloader 7 | 8 | 9 | class LlamaParseDownloader(Downloader): 10 | service = "llamaparse" 11 | 12 | def get_html(self, pdf_bytes): 13 | rand_name = str(time.time()) + ".pdf" 14 | start = time.time() 15 | buff = io.BytesIO(pdf_bytes) 16 | md = upload_and_parse_file(self.api_key, rand_name, buff) 17 | end = time.time() 18 | if isinstance(md, bytes): 19 | md = md.decode("utf-8") 20 | 21 | return { 22 | "md": md, 23 | "time": end - start, 24 | } 25 | 26 | 27 | def upload_and_parse_file(api_key: str, fname: str, buff, max_retries: int = 180, delay: int = 1): 28 | headers = { 29 | "Authorization": f"Bearer {api_key}", 30 | "Accept": "application/json" 31 | } 32 | 33 | # Upload file 34 | files = { 35 | 'file': (fname, buff, 'application/pdf') 36 | } 37 | response = requests.post( 38 | 'https://api.cloud.llamaindex.ai/api/v1/parsing/upload', 39 
| headers=headers, 40 | files=files 41 | ) 42 | response.raise_for_status() 43 | job_id = response.json()['id'] 44 | 45 | # Poll for completion 46 | for _ in range(max_retries): 47 | status_response = requests.get( 48 | f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}', 49 | headers=headers 50 | ) 51 | status_response.raise_for_status() 52 | if status_response.json()['status'] == 'SUCCESS': 53 | # Get results 54 | result_response = requests.get( 55 | f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}/result/markdown', 56 | headers=headers 57 | ) 58 | result_response.raise_for_status() 59 | return result_response.json()['markdown'] 60 | 61 | time.sleep(delay) 62 | 63 | raise TimeoutError("Job did not complete within the maximum retry attempts") -------------------------------------------------------------------------------- /benchmarks/overall/download/main.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from benchmarks.overall.download.llamaparse import LlamaParseDownloader 4 | from benchmarks.overall.download.mathpix import MathpixDownloader 5 | from benchmarks.overall.download.mistral import MistralDownloader 6 | 7 | 8 | @click.command("Download data from inference services") 9 | @click.argument("service", type=click.Choice(["mathpix", "llamaparse", "mistral"])) 10 | @click.option("--max_rows", type=int, default=2200) 11 | @click.option("--api_key", type=str, default=None) 12 | @click.option("--app_id", type=str, default=None) 13 | def main(service: str, max_rows: int, api_key: str, app_id: str): 14 | registry = { 15 | "mathpix": MathpixDownloader, 16 | "llamaparse": LlamaParseDownloader, 17 | "mistral": MistralDownloader, 18 | } 19 | downloader = registry[service](api_key, app_id, max_rows=max_rows) 20 | 21 | # Generate data and upload to hub 22 | downloader() 23 | 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- /benchmarks/overall/download/mathpix.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | 4 | import requests 5 | 6 | from benchmarks.overall.download.base import Downloader 7 | 8 | 9 | class MathpixDownloader(Downloader): 10 | service = "mathpix" 11 | 12 | def get_html(self, pdf_bytes): 13 | headers = { 14 | "app_id": self.app_id, 15 | "app_key": self.api_key, 16 | } 17 | start = time.time() 18 | pdf_id = mathpix_request(pdf_bytes, headers) 19 | status = mathpix_status(pdf_id, headers) 20 | if status in ["processing", "error"]: 21 | md = "" 22 | else: 23 | md = mathpix_results(pdf_id, headers) 24 | end = time.time() 25 | if isinstance(md, bytes): 26 | md = md.decode("utf-8") 27 | 28 | return { 29 | "md": md, 30 | "time": end - start 31 | } 32 | 33 | def mathpix_request(buffer, headers): 34 | response = requests.post("https://api.mathpix.com/v3/pdf", 35 | headers=headers, 36 | data={ 37 | "options_json": json.dumps( 38 | { 39 | "conversion_formats": { 40 | "md": True, 41 | "html": True 42 | } 43 | } 44 | ) 45 | }, 46 | files={ 47 | "file": buffer 48 | } 49 | ) 50 | data = response.json() 51 | pdf_id = data["pdf_id"] 52 | return pdf_id 53 | 54 | def mathpix_status(pdf_id, headers): 55 | max_iters = 120 56 | i = 0 57 | status = "processing" 58 | status2 = "processing" 59 | while i < max_iters: 60 | time.sleep(1) 61 | response = requests.get(f"https://api.mathpix.com/v3/converter/{pdf_id}", 62 | headers=headers 63 | ) 64 | status_resp = 
response.json() 65 | if "conversion_status" not in status_resp: 66 | continue 67 | status = status_resp["conversion_status"]["md"]["status"] 68 | status2 = status_resp["conversion_status"]["html"]["status"] 69 | if status == "completed" and status2 == "completed": 70 | break 71 | elif status == "error" or status2 == "error": 72 | break 73 | out_status = "completed" if status == "completed" and status2 == "completed" else "error" 74 | return out_status 75 | 76 | def mathpix_results(pdf_id, headers, ext="md"): 77 | response = requests.get(f"https://api.mathpix.com/v3/converter/{pdf_id}.{ext}", 78 | headers=headers 79 | ) 80 | return response.content 81 | -------------------------------------------------------------------------------- /benchmarks/overall/download/mistral.py: -------------------------------------------------------------------------------- 1 | import io 2 | import time 3 | import requests 4 | 5 | from benchmarks.overall.download.base import Downloader 6 | 7 | 8 | class MistralDownloader(Downloader): 9 | service = "mistral" 10 | 11 | def get_html(self, pdf_bytes): 12 | rand_name = str(time.time()) + ".pdf" 13 | start = time.time() 14 | buff = io.BytesIO(pdf_bytes) 15 | md = upload_and_process_file(self.api_key, rand_name, buff) 16 | end = time.time() 17 | if isinstance(md, bytes): 18 | md = md.decode("utf-8") 19 | 20 | return { 21 | "md": md, 22 | "time": end - start, 23 | } 24 | 25 | 26 | def upload_and_process_file(api_key: str, fname: str, buff): 27 | headers = { 28 | "Authorization": f"Bearer {api_key}" 29 | } 30 | 31 | upload_headers = headers.copy() 32 | files = { 33 | 'file': (fname, buff, 'application/pdf'), 34 | 'purpose': (None, 'ocr') 35 | } 36 | 37 | upload_response = requests.post( 38 | 'https://api.mistral.ai/v1/files', 39 | headers=upload_headers, 40 | files=files 41 | ) 42 | upload_response.raise_for_status() 43 | file_id = upload_response.json()['id'] 44 | 45 | url_headers = headers.copy() 46 | url_headers["Accept"] = "application/json" 47 | 48 | url_response = requests.get( 49 | f'https://api.mistral.ai/v1/files/{file_id}/url?expiry=24', 50 | headers=url_headers 51 | ) 52 | url_response.raise_for_status() 53 | signed_url = url_response.json()['url'] 54 | 55 | ocr_headers = headers.copy() 56 | ocr_headers["Content-Type"] = "application/json" 57 | 58 | ocr_data = { 59 | "model": "mistral-ocr-latest", 60 | "document": { 61 | "type": "document_url", 62 | "document_url": signed_url 63 | }, 64 | "include_image_base64": True 65 | } 66 | ocr_response = requests.post( 67 | 'https://api.mistral.ai/v1/ocr', 68 | headers=ocr_headers, 69 | json=ocr_data 70 | ) 71 | ocr_response.raise_for_status() 72 | result = ocr_response.json() 73 | return result["pages"][0]["markdown"] -------------------------------------------------------------------------------- /benchmarks/overall/methods/docling.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import time 3 | 4 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult 5 | 6 | 7 | class DoclingMethod(BaseMethod): 8 | model_dict: dict = None 9 | use_llm: bool = False 10 | 11 | def __call__(self, sample) -> BenchmarkResult: 12 | from docling.document_converter import DocumentConverter 13 | pdf_bytes = sample["pdf"] # This is a single page PDF 14 | converter = DocumentConverter() 15 | 16 | with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f: 17 | f.write(pdf_bytes) 18 | start = time.time() 19 | result = converter.convert(f.name) 20 | total = time.time() - 
start 21 | 22 | return { 23 | "markdown": result.document.export_to_markdown(), 24 | "time": total 25 | } 26 | 27 | -------------------------------------------------------------------------------- /benchmarks/overall/methods/gt.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import json 3 | 4 | from PIL import Image 5 | 6 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult 7 | 8 | 9 | class GTMethod(BaseMethod): 10 | def __call__(self, sample) -> BenchmarkResult: 11 | gt_blocks = json.loads(sample["gt_blocks"]) 12 | gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0] 13 | gt_markdown = [self.convert_to_md(block) for block in gt_html] 14 | return { 15 | "markdown": gt_markdown, 16 | "time": 0 17 | } 18 | 19 | def render(self, html: List[str]) -> Image.Image: 20 | joined = "\n\n".join(html) 21 | html = f""" 22 | 23 | 24 | 25 | {joined} 26 | 27 | 28 | """.strip() 29 | return self.html_to_image(html) -------------------------------------------------------------------------------- /benchmarks/overall/methods/llamaparse.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | 3 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult 4 | 5 | 6 | class LlamaParseMethod(BaseMethod): 7 | llamaparse_ds: datasets.Dataset = None 8 | 9 | def __call__(self, sample) -> BenchmarkResult: 10 | uuid = sample["uuid"] 11 | data = None 12 | for row in self.llamaparse_ds: 13 | if str(row["uuid"]) == str(uuid): 14 | data = row 15 | break 16 | if not data: 17 | raise ValueError(f"Could not find data for uuid {uuid}") 18 | 19 | return { 20 | "markdown": data["md"], 21 | "time": data["time"] 22 | } -------------------------------------------------------------------------------- /benchmarks/overall/methods/marker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import time 4 | 5 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult 6 | from marker.config.parser import ConfigParser 7 | from marker.converters.pdf import PdfConverter 8 | 9 | 10 | class MarkerMethod(BaseMethod): 11 | model_dict: dict = None 12 | use_llm: bool = False 13 | 14 | def __call__(self, sample) -> BenchmarkResult: 15 | pdf_bytes = sample["pdf"] # This is a single page PDF 16 | parser = ConfigParser({ 17 | "page_range": "0", 18 | "disable_tqdm": True, 19 | "use_llm": self.use_llm, 20 | "redo_inline_math": self.use_llm, 21 | "llm_service": "marker.services.vertex.GoogleVertexService", 22 | "vertex_project_id": os.getenv("VERTEX_PROJECT_ID"), 23 | }) 24 | 25 | block_converter = PdfConverter( 26 | artifact_dict=self.model_dict, 27 | config=parser.generate_config_dict(), 28 | llm_service=parser.get_llm_service() 29 | ) 30 | 31 | with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f: 32 | f.write(pdf_bytes) 33 | start = time.time() 34 | rendered = block_converter(f.name) 35 | total = time.time() - start 36 | 37 | return { 38 | "markdown": rendered.markdown, 39 | "time": total 40 | } 41 | 42 | -------------------------------------------------------------------------------- /benchmarks/overall/methods/mathpix.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | 3 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult 4 | 5 | 6 | class MathpixMethod(BaseMethod): 7 | mathpix_ds: datasets.Dataset = None 8 | 9 | def 
__call__(self, sample) -> BenchmarkResult: 10 | uuid = sample["uuid"] 11 | data = None 12 | for row in self.mathpix_ds: 13 | if str(row["uuid"]) == str(uuid): 14 | data = row 15 | break 16 | if not data: 17 | raise ValueError(f"Could not find data for uuid {uuid}") 18 | 19 | return { 20 | "markdown": data["md"], 21 | "time": data["time"] 22 | } -------------------------------------------------------------------------------- /benchmarks/overall/methods/mistral.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | 3 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult 4 | 5 | 6 | class MistralMethod(BaseMethod): 7 | mistral_ds: datasets.Dataset = None 8 | 9 | def __call__(self, sample) -> BenchmarkResult: 10 | uuid = sample["uuid"] 11 | data = None 12 | for row in self.mistral_ds: 13 | if str(row["uuid"]) == str(uuid): 14 | data = row 15 | break 16 | if not data: 17 | raise ValueError(f"Could not find data for uuid {uuid}") 18 | 19 | return { 20 | "markdown": data["md"], 21 | "time": data["time"] 22 | } -------------------------------------------------------------------------------- /benchmarks/overall/methods/olmocr.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | import tempfile 4 | import time 5 | from io import BytesIO 6 | 7 | import torch 8 | from PIL import Image 9 | 10 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult 11 | 12 | 13 | def convert_single_page(filename: str, model, processor, device): 14 | from olmocr.data.renderpdf import render_pdf_to_base64png 15 | from olmocr.prompts import build_finetuning_prompt 16 | from olmocr.prompts.anchor import get_anchor_text 17 | 18 | image_base64 = render_pdf_to_base64png(filename, 1, target_longest_image_dim=1024) 19 | 20 | # Build the prompt, using document metadata 21 | anchor_text = get_anchor_text(filename, 1, pdf_engine="pdfreport", target_length=4000) 22 | prompt = build_finetuning_prompt(anchor_text) 23 | 24 | # Build the full prompt 25 | messages = [ 26 | { 27 | "role": "user", 28 | "content": [ 29 | {"type": "text", "text": prompt}, 30 | {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}, 31 | ], 32 | } 33 | ] 34 | 35 | # Apply the chat template and processor 36 | text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 37 | main_image = Image.open(BytesIO(base64.b64decode(image_base64))) 38 | 39 | inputs = processor( 40 | text=[text], 41 | images=[main_image], 42 | padding=True, 43 | return_tensors="pt", 44 | ) 45 | inputs = {key: value.to(device) for (key, value) in inputs.items()} 46 | 47 | # Generate the output 48 | output = model.generate( 49 | **inputs, 50 | temperature=0.8, 51 | max_new_tokens=8192, 52 | num_return_sequences=1, 53 | do_sample=True, 54 | ) 55 | 56 | # Decode the output 57 | prompt_length = inputs["input_ids"].shape[1] 58 | new_tokens = output[:, prompt_length:] 59 | text_output = processor.tokenizer.batch_decode( 60 | new_tokens, skip_special_tokens=True 61 | )[0] 62 | 63 | try: 64 | text_output = json.loads(text_output) 65 | text = text_output["natural_text"] 66 | except Exception: 67 | try: 68 | text = text_output.split("natural_text")[1].strip() 69 | except Exception: 70 | text = "" 71 | 72 | return text 73 | 74 | 75 | class OlmOCRMethod(BaseMethod): 76 | olmocr_model: dict = None 77 | use_llm: bool = False 78 | 79 | def __call__(self, sample) -> BenchmarkResult: 80 | pdf_bytes 
= sample["pdf"] # This is a single page PDF 81 | 82 | with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f: 83 | f.write(pdf_bytes) 84 | start = time.time() 85 | result = convert_single_page(f.name, self.olmocr_model["model"], self.olmocr_model["processor"], self.olmocr_model["model"].device) 86 | total = time.time() - start 87 | 88 | return { 89 | "markdown": result, 90 | "time": total 91 | } 92 | -------------------------------------------------------------------------------- /benchmarks/overall/methods/schema.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict, List 2 | 3 | 4 | class BenchmarkResult(TypedDict): 5 | markdown: str | List[str] 6 | time: float | None -------------------------------------------------------------------------------- /benchmarks/overall/registry.py: -------------------------------------------------------------------------------- 1 | from benchmarks.overall.methods.docling import DoclingMethod 2 | from benchmarks.overall.methods.gt import GTMethod 3 | from benchmarks.overall.methods.llamaparse import LlamaParseMethod 4 | from benchmarks.overall.methods.marker import MarkerMethod 5 | from benchmarks.overall.methods.mathpix import MathpixMethod 6 | from benchmarks.overall.methods.mistral import MistralMethod 7 | from benchmarks.overall.methods.olmocr import OlmOCRMethod 8 | from benchmarks.overall.scorers.heuristic import HeuristicScorer 9 | from benchmarks.overall.scorers.llm import LLMScorer 10 | 11 | SCORE_REGISTRY = { 12 | "heuristic": HeuristicScorer, 13 | "llm": LLMScorer 14 | } 15 | 16 | METHOD_REGISTRY = { 17 | "marker": MarkerMethod, 18 | "gt": GTMethod, 19 | "mathpix": MathpixMethod, 20 | "llamaparse": LlamaParseMethod, 21 | "docling": DoclingMethod, 22 | "olmocr": OlmOCRMethod, 23 | "mistral": MistralMethod 24 | } -------------------------------------------------------------------------------- /benchmarks/overall/schema.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict, List, Dict 2 | 3 | from benchmarks.overall.scorers.schema import BlockScores 4 | 5 | AVG_TYPE = Dict[str, Dict[str, Dict[str, List[float]]]] 6 | 7 | class FullResult(TypedDict): 8 | scores: Dict[int, Dict[str, Dict[str, BlockScores]]] 9 | averages_by_type: AVG_TYPE 10 | averages_by_block_type: AVG_TYPE 11 | average_times: Dict[str, List[float]] 12 | markdown: Dict[int, Dict[str, str]] 13 | -------------------------------------------------------------------------------- /benchmarks/overall/scorers/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from benchmarks.overall.scorers.schema import BlockScores 4 | 5 | 6 | class BaseScorer: 7 | def __init__(self): 8 | pass 9 | 10 | def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores: 11 | raise NotImplementedError() -------------------------------------------------------------------------------- /benchmarks/overall/scorers/schema.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict, List, Optional, Dict 2 | 3 | 4 | class BlockScores(TypedDict): 5 | score: float 6 | specific_scores: Dict[str, float | List[float]] 7 | -------------------------------------------------------------------------------- /benchmarks/table/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/benchmarks/table/__init__.py -------------------------------------------------------------------------------- /benchmarks/table/gemini.py: -------------------------------------------------------------------------------- 1 | import json 2 | from PIL import Image 3 | from google import genai 4 | from google.genai import types 5 | from io import BytesIO 6 | from pydantic import BaseModel 7 | 8 | from marker.settings import settings 9 | 10 | prompt = """ 11 | You're an expert document analyst who is good at turning tables in documents into HTML. Analyze the provided image, and convert it to a faithful HTML representation. 12 | 13 | Guidelines: 14 | - Keep the HTML simple and concise. 15 | - Only include the <table> tag and contents. 16 | - Only use <table>, <tr>, and <td> tags. Only use the colspan and rowspan attributes if necessary. Do not use <tbody>, <thead>, or <th> tags. 17 | - Make sure the table is as faithful to the image as possible with the given tags. 18 | 19 | **Instructions** 20 | 1. Analyze the image, and determine the table structure. 21 | 2. Convert the table image to HTML, following the guidelines above. 22 | 3. Output only the HTML for the table, starting with the <table> tag and ending with the </table>
tag. 23 | """.strip() 24 | 25 | class TableSchema(BaseModel): 26 | table_html: str 27 | 28 | def gemini_table_rec(image: Image.Image): 29 | client = genai.Client( 30 | api_key=settings.GOOGLE_API_KEY, 31 | http_options={"timeout": 60000} 32 | ) 33 | 34 | image_bytes = BytesIO() 35 | image.save(image_bytes, format="PNG") 36 | 37 | responses = client.models.generate_content( 38 | model="gemini-2.0-flash", 39 | contents=[types.Part.from_bytes(data=image_bytes.getvalue(), mime_type="image/png"), prompt], # According to gemini docs, it performs better if the image is the first element 40 | config={ 41 | "temperature": 0, 42 | "response_schema": TableSchema, 43 | "response_mime_type": "application/json", 44 | }, 45 | ) 46 | 47 | output = responses.candidates[0].content.parts[0].text 48 | return json.loads(output)["table_html"] -------------------------------------------------------------------------------- /benchmarks/throughput/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/benchmarks/throughput/__init__.py -------------------------------------------------------------------------------- /benchmarks/verify_scores.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | 5 | def verify_scores(file_path): 6 | with open(file_path, 'r') as file: 7 | data = json.load(file) 8 | 9 | raw_scores = [data["scores"][k] for k in data["scores"]] 10 | marker_scores = [r["marker"]["heuristic"]["score"] for r in raw_scores] 11 | marker_score = sum(marker_scores) / len(marker_scores) 12 | if marker_score < 90: 13 | raise ValueError("Marker score below 90") 14 | 15 | 16 | def verify_table_scores(file_path): 17 | with open(file_path, 'r') as file: 18 | data = json.load(file) 19 | 20 | avg = sum([r["marker_score"] for r in data["marker"]]) / len(data["marker"])  # average over marker rows, not over top-level keys 21 | if avg < 0.7: 22 | raise ValueError("Average score is below the required threshold of 0.7") 23 | 24 | 25 | if __name__ == "__main__": 26 | parser = argparse.ArgumentParser(description="Verify benchmark scores") 27 | parser.add_argument("file_path", type=str, help="Path to the json file") 28 | parser.add_argument("--type", type=str, help="Type of file to verify", default="marker") 29 | args = parser.parse_args() 30 | if args.type == "marker": 31 | verify_scores(args.file_path) 32 | elif args.type == "table": 33 | verify_table_scores(args.file_path) 34 | -------------------------------------------------------------------------------- /chunk_convert.py: -------------------------------------------------------------------------------- 1 | from marker.scripts.chunk_convert import chunk_convert_cli 2 | 3 | if __name__ == "__main__": 4 | chunk_convert_cli() -------------------------------------------------------------------------------- /convert.py: -------------------------------------------------------------------------------- 1 | from marker.scripts.convert import convert_cli 2 | 3 | if __name__ == "__main__": 4 | convert_cli() 5 | -------------------------------------------------------------------------------- /convert_single.py: -------------------------------------------------------------------------------- 1 | from marker.scripts.convert_single import convert_single_cli 2 | 3 | if __name__ == "__main__": 4 | convert_single_cli() 5 | -------------------------------------------------------------------------------- /data/.gitignore:
-------------------------------------------------------------------------------- 1 | latex 2 | pdfs 3 | references -------------------------------------------------------------------------------- /data/examples/markdown/multicolcnn/_page_1_Figure_0.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/multicolcnn/_page_1_Figure_0.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/multicolcnn/_page_2_Picture_0.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/multicolcnn/_page_2_Picture_0.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/multicolcnn/_page_6_Figure_0.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/multicolcnn/_page_6_Figure_0.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/multicolcnn/_page_7_Figure_0.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/multicolcnn/_page_7_Figure_0.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/switch_transformers/_page_11_Figure_4.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_11_Figure_4.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/switch_transformers/_page_12_Figure_4.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_12_Figure_4.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/switch_transformers/_page_13_Figure_2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_13_Figure_2.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/switch_transformers/_page_18_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_18_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/switch_transformers/_page_18_Figure_3.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_18_Figure_3.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/switch_transformers/_page_20_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_20_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/switch_transformers/_page_20_Figure_4.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_20_Figure_4.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/switch_transformers/_page_27_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_27_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/switch_transformers/_page_29_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_29_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/switch_transformers/_page_2_Figure_3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_2_Figure_3.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/switch_transformers/_page_30_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_30_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/switch_transformers/_page_31_Figure_3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_31_Figure_3.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/switch_transformers/_page_4_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_4_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/switch_transformers/_page_5_Figure_3.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/switch_transformers/_page_5_Figure_3.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_109_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_109_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_115_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_115_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_116_Figure_3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_116_Figure_3.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_127_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_127_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_128_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_128_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_167_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_167_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_169_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_169_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_173_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_173_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_190_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_190_Figure_1.jpeg 
-------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_195_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_195_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_205_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_205_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_230_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_230_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_233_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_233_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_233_Figure_3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_233_Figure_3.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_234_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_234_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_235_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_235_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_236_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_236_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_236_Figure_3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_236_Figure_3.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_237_Figure_1.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_237_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_238_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_238_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_23_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_23_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_23_Figure_3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_23_Figure_3.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_46_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_46_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_60_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_60_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_60_Figure_3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_60_Figure_3.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_67_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_67_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_71_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_71_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_78_Figure_1.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_78_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_85_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_85_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_94_Figure_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_94_Figure_1.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_99_Figure_17.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_99_Figure_17.jpeg -------------------------------------------------------------------------------- /data/examples/markdown/thinkpython/_page_99_Figure_178.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/examples/markdown/thinkpython/_page_99_Figure_178.jpeg -------------------------------------------------------------------------------- /data/images/overall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/images/overall.png -------------------------------------------------------------------------------- /data/images/per_doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/images/per_doc.png -------------------------------------------------------------------------------- /data/images/table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/data/images/table.png -------------------------------------------------------------------------------- /data/latex_to_md.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # List all .tex files in the latex folder 4 | FILES=$(find latex -name "*.tex") 5 | 6 | for f in $FILES 7 | do 8 | echo "Processing $f file..." 
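# Derive the output path, convert LaTeX to CommonMark with pandoc, then
# strip conversion artifacts (special spaces, \cite commands, brackets)
# with the sed passes below.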
9 | base_name=$(basename "$f" .tex)
10 | out_file="references/${base_name}.md"
11 |
12 | pandoc --wrap=none \
13 | --no-highlight \
14 | --strip-comments \
15 | --from=latex \
16 | --to=commonmark_x+pipe_tables \
17 | "$f" \
18 | -o "$out_file"
19 | # Replace non-breaking spaces
20 | sed -i.bak 's/ / /g' "$out_file"
21 | sed -i.bak 's/ / /g' "$out_file"
22 | sed -i.bak 's/ / /g' "$out_file"
23 | sed -i.bak 's/ / /g' "$out_file"
24 | sed -i.bak -E 's/`\\cite`//g; s/<[^>]*>//g; s/\{[^}]*\}//g; s/\\cite\{[^}]*\}//g' "$out_file"
25 | sed -i.bak -E '
26 | s/`\\cite`//g; # Remove \cite commands inside backticks
27 | s/::: //g; # Remove the leading ::: for content markers
28 | s/\[//g; # Remove opening square bracket
29 | s/\]//g; # Remove closing square bracket
30 | ' "$out_file"
31 | # Remove .bak file
32 | rm "$out_file.bak"
33 | done
34 |
35 |
-------------------------------------------------------------------------------- /extraction_app.py: --------------------------------------------------------------------------------
1 | from marker.scripts.run_streamlit_app import extraction_app_cli
2 |
3 | if __name__ == "__main__":
4 | extraction_app_cli()
5 |
-------------------------------------------------------------------------------- /marker/builders/__init__.py: --------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from pydantic import BaseModel
4 |
5 | from marker.util import assign_config
6 |
7 |
8 | class BaseBuilder:
9 | def __init__(self, config: Optional[BaseModel | dict] = None):
10 | assign_config(self, config)
11 |
12 | def __call__(self, data, *args, **kwargs):
13 | raise NotImplementedError
14 |
-------------------------------------------------------------------------------- /marker/builders/document.py: --------------------------------------------------------------------------------
1 | from typing import Annotated
2 |
3 | from marker.builders import BaseBuilder
4 | from marker.builders.layout import LayoutBuilder
5 | from marker.builders.line import LineBuilder
6 | from marker.builders.ocr import OcrBuilder
7 | from marker.providers.pdf import PdfProvider
8 | from marker.schema import BlockTypes
9 | from marker.schema.document import Document
10 | from marker.schema.groups.page import PageGroup
11 | from marker.schema.registry import get_block_class
12 |
13 |
14 | class DocumentBuilder(BaseBuilder):
15 | """
16 | Constructs a Document given a PdfProvider, LayoutBuilder, LineBuilder, and OcrBuilder.
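    A minimal usage sketch (illustrative only; in practice a converter
    constructs the provider and builders and wires them together):

        builder = DocumentBuilder({"disable_ocr": False})
        document = builder(provider, layout_builder, line_builder, ocr_builder)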
17 | """ 18 | lowres_image_dpi: Annotated[ 19 | int, 20 | "DPI setting for low-resolution page images used for Layout and Line Detection.", 21 | ] = 96 22 | highres_image_dpi: Annotated[ 23 | int, 24 | "DPI setting for high-resolution page images used for OCR.", 25 | ] = 192 26 | disable_ocr: Annotated[ 27 | bool, 28 | "Disable OCR processing.", 29 | ] = False 30 | 31 | def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, line_builder: LineBuilder, ocr_builder: OcrBuilder): 32 | document = self.build_document(provider) 33 | layout_builder(document, provider) 34 | line_builder(document, provider) 35 | if not self.disable_ocr: 36 | ocr_builder(document, provider) 37 | return document 38 | 39 | def build_document(self, provider: PdfProvider): 40 | PageGroupClass: PageGroup = get_block_class(BlockTypes.Page) 41 | lowres_images = provider.get_images(provider.page_range, self.lowres_image_dpi) 42 | highres_images = provider.get_images(provider.page_range, self.highres_image_dpi) 43 | initial_pages = [ 44 | PageGroupClass( 45 | page_id=p, 46 | lowres_image=lowres_images[i], 47 | highres_image=highres_images[i], 48 | polygon=provider.get_page_bbox(p), 49 | refs=provider.get_page_refs(p) 50 | ) for i, p in enumerate(provider.page_range) 51 | ] 52 | DocumentClass: Document = get_block_class(BlockTypes.Document) 53 | return DocumentClass(filepath=provider.filepath, pages=initial_pages) 54 | -------------------------------------------------------------------------------- /marker/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/marker/config/__init__.py -------------------------------------------------------------------------------- /marker/converters/__init__.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from typing import Optional, List, Type 3 | 4 | from pydantic import BaseModel 5 | 6 | from marker.processors import BaseProcessor 7 | from marker.processors.llm import BaseLLMSimpleBlockProcessor 8 | from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor 9 | from marker.util import assign_config, download_font 10 | 11 | 12 | class BaseConverter: 13 | def __init__(self, config: Optional[BaseModel | dict] = None): 14 | assign_config(self, config) 15 | self.config = config 16 | self.llm_service = None 17 | 18 | # Download render font, needed for some providers 19 | download_font() 20 | 21 | def __call__(self, *args, **kwargs): 22 | raise NotImplementedError 23 | 24 | def resolve_dependencies(self, cls): 25 | init_signature = inspect.signature(cls.__init__) 26 | parameters = init_signature.parameters 27 | 28 | resolved_kwargs = {} 29 | for param_name, param in parameters.items(): 30 | if param_name == 'self': 31 | continue 32 | elif param_name == 'config': 33 | resolved_kwargs[param_name] = self.config 34 | elif param.name in self.artifact_dict: 35 | resolved_kwargs[param_name] = self.artifact_dict[param_name] 36 | elif param.default != inspect.Parameter.empty: 37 | resolved_kwargs[param_name] = param.default 38 | else: 39 | raise ValueError(f"Cannot resolve dependency for parameter: {param_name}") 40 | 41 | return cls(**resolved_kwargs) 42 | 43 | def initialize_processors(self, processor_cls_lst: List[Type[BaseProcessor]]) -> List[BaseProcessor]: 44 | processors = [] 45 | for processor_cls in processor_cls_lst: 46 | 
processors.append(self.resolve_dependencies(processor_cls)) 47 | 48 | simple_llm_processors = [p for p in processors if issubclass(type(p), BaseLLMSimpleBlockProcessor)] 49 | other_processors = [p for p in processors if not issubclass(type(p), BaseLLMSimpleBlockProcessor)] 50 | 51 | if not simple_llm_processors: 52 | return processors 53 | 54 | llm_positions = [i for i, p in enumerate(processors) if issubclass(type(p), BaseLLMSimpleBlockProcessor)] 55 | insert_position = max(0, llm_positions[-1] - len(simple_llm_processors) + 1) 56 | 57 | meta_processor = LLMSimpleBlockMetaProcessor( 58 | processor_lst=simple_llm_processors, 59 | llm_service=self.llm_service, 60 | config=self.config, 61 | ) 62 | other_processors.insert(insert_position, meta_processor) 63 | return other_processors -------------------------------------------------------------------------------- /marker/converters/extraction.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | from marker.builders.document import DocumentBuilder 5 | from marker.builders.line import LineBuilder 6 | from marker.builders.ocr import OcrBuilder 7 | from marker.builders.structure import StructureBuilder 8 | from marker.converters.pdf import PdfConverter 9 | from marker.extractors.page import PageExtractor, json_schema_to_base_model 10 | from marker.providers.registry import provider_from_filepath 11 | 12 | from marker.renderers.extraction import ExtractionRenderer, ExtractionOutput 13 | from marker.renderers.markdown import MarkdownRenderer 14 | 15 | from marker.logger import get_logger 16 | 17 | logger = get_logger() 18 | 19 | 20 | class ExtractionConverter(PdfConverter): 21 | pattern: str = r"{\d+\}-{48}\n\n" 22 | 23 | def build_document(self, filepath: str): 24 | provider_cls = provider_from_filepath(filepath) 25 | layout_builder = self.resolve_dependencies(self.layout_builder_class) 26 | line_builder = self.resolve_dependencies(LineBuilder) 27 | ocr_builder = self.resolve_dependencies(OcrBuilder) 28 | provider = provider_cls(filepath, self.config) 29 | document = DocumentBuilder(self.config)( 30 | provider, layout_builder, line_builder, ocr_builder 31 | ) 32 | structure_builder_cls = self.resolve_dependencies(StructureBuilder) 33 | structure_builder_cls(document) 34 | 35 | for processor in self.processor_list: 36 | processor(document) 37 | 38 | return document, provider 39 | 40 | def __call__(self, filepath: str) -> ExtractionOutput: 41 | self.config["paginate_output"] = True # Ensure we can split the output properly 42 | self.config["output_format"] = ( 43 | "markdown" # Output must be markdown for extraction 44 | ) 45 | try: 46 | json_schema_to_base_model(json.loads(self.config["page_schema"])) 47 | except Exception as e: 48 | logger.error(f"Could not parse page schema: {e}") 49 | raise ValueError( 50 | "Could not parse your page schema. Please check the schema format." 
51 | ) 52 | 53 | document, provider = self.build_document(filepath) 54 | renderer = self.resolve_dependencies(MarkdownRenderer) 55 | output = renderer(document) 56 | 57 | output_pages = re.split(self.pattern, output.markdown)[ 58 | 1: 59 | ] # Split output into pages 60 | 61 | # This needs an LLM service for extraction, this sets it in the extractor 62 | if not self.artifact_dict["llm_service"]: 63 | self.artifact_dict["llm_service"] = self.resolve_dependencies( 64 | self.default_llm_service 65 | ) 66 | 67 | extractor = self.resolve_dependencies(PageExtractor) 68 | renderer = self.resolve_dependencies(ExtractionRenderer) 69 | 70 | pnums = provider.page_range 71 | all_json = {} 72 | for page, page_md, pnum in zip(document.pages, output_pages, pnums): 73 | extracted_json = extractor(document, page, page_md.strip()) 74 | all_json[pnum] = extracted_json 75 | 76 | merged = renderer(all_json) 77 | return merged 78 | -------------------------------------------------------------------------------- /marker/converters/ocr.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | from marker.builders.document import DocumentBuilder 4 | from marker.builders.line import LineBuilder 5 | from marker.builders.ocr import OcrBuilder 6 | from marker.converters.pdf import PdfConverter 7 | from marker.processors import BaseProcessor 8 | from marker.processors.equation import EquationProcessor 9 | from marker.providers.registry import provider_from_filepath 10 | from marker.renderers.ocr_json import OCRJSONRenderer 11 | 12 | 13 | class OCRConverter(PdfConverter): 14 | default_processors: Tuple[BaseProcessor, ...] = (EquationProcessor,) 15 | 16 | def __init__(self, *args, **kwargs): 17 | super().__init__(*args, **kwargs) 18 | 19 | if not self.config: 20 | self.config = {} 21 | 22 | self.config["format_lines"] = True 23 | self.renderer = OCRJSONRenderer 24 | 25 | def build_document(self, filepath: str): 26 | provider_cls = provider_from_filepath(filepath) 27 | layout_builder = self.resolve_dependencies(self.layout_builder_class) 28 | line_builder = self.resolve_dependencies(LineBuilder) 29 | ocr_builder = self.resolve_dependencies(OcrBuilder) 30 | document_builder = DocumentBuilder(self.config) 31 | 32 | provider = provider_cls(filepath, self.config) 33 | document = document_builder(provider, layout_builder, line_builder, ocr_builder) 34 | 35 | for processor in self.processor_list: 36 | processor(document) 37 | 38 | return document 39 | 40 | def __call__(self, filepath: str): 41 | document = self.build_document(filepath) 42 | renderer = self.resolve_dependencies(self.renderer) 43 | return renderer(document) 44 | -------------------------------------------------------------------------------- /marker/converters/table.py: -------------------------------------------------------------------------------- 1 | from functools import cache 2 | from typing import Tuple, List 3 | 4 | from marker.builders.document import DocumentBuilder 5 | from marker.builders.line import LineBuilder 6 | from marker.builders.ocr import OcrBuilder 7 | from marker.converters.pdf import PdfConverter 8 | from marker.processors import BaseProcessor 9 | from marker.processors.llm.llm_complex import LLMComplexRegionProcessor 10 | from marker.processors.llm.llm_form import LLMFormProcessor 11 | from marker.processors.llm.llm_table import LLMTableProcessor 12 | from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor 13 | from marker.processors.table import 
TableProcessor 14 | from marker.providers.registry import provider_from_filepath 15 | from marker.schema import BlockTypes 16 | 17 | 18 | class TableConverter(PdfConverter): 19 | default_processors: Tuple[BaseProcessor, ...] = ( 20 | TableProcessor, 21 | LLMTableProcessor, 22 | LLMTableMergeProcessor, 23 | LLMFormProcessor, 24 | LLMComplexRegionProcessor, 25 | ) 26 | converter_block_types: List[BlockTypes] = (BlockTypes.Table, BlockTypes.Form, BlockTypes.TableOfContents) 27 | 28 | def build_document(self, filepath: str): 29 | provider_cls = provider_from_filepath(filepath) 30 | layout_builder = self.resolve_dependencies(self.layout_builder_class) 31 | line_builder = self.resolve_dependencies(LineBuilder) 32 | ocr_builder = self.resolve_dependencies(OcrBuilder) 33 | document_builder = DocumentBuilder(self.config) 34 | document_builder.disable_ocr = True 35 | 36 | provider = provider_cls(filepath, self.config) 37 | document = document_builder(provider, layout_builder, line_builder, ocr_builder) 38 | 39 | for page in document.pages: 40 | page.structure = [p for p in page.structure if p.block_type in self.converter_block_types] 41 | 42 | for processor in self.processor_list: 43 | processor(document) 44 | 45 | return document 46 | 47 | def __call__(self, filepath: str): 48 | document = self.build_document(filepath) 49 | renderer = self.resolve_dependencies(self.renderer) 50 | return renderer(document) -------------------------------------------------------------------------------- /marker/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated, Sequence, Optional 2 | 3 | from pydantic import BaseModel 4 | 5 | from marker.schema import BlockTypes 6 | from marker.schema.document import Document 7 | from marker.schema.groups import PageGroup 8 | from PIL import Image 9 | 10 | from marker.services import BaseService 11 | from marker.util import assign_config 12 | 13 | 14 | class ExtractionResult(BaseModel): 15 | extracted_data: dict | list 16 | value_confidence: int 17 | existence_confidence: int 18 | 19 | 20 | class BaseExtractor: 21 | """ 22 | An extractor that uses a provided service to extract structured data from documents. 
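    Concrete subclasses (e.g. PageExtractor) implement __call__ and return an
    ExtractionResult. A minimal sketch, assuming a configured llm_service and
    a hypothetical subclass SomeExtractor:

        extractor = SomeExtractor(llm_service, config={"max_concurrency": 3})
        result = extractor(document, page)
        if result is not None:
            print(result.extracted_data, result.value_confidence)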
23 | """ 24 | 25 | max_concurrency: Annotated[ 26 | int, 27 | "The maximum number of concurrent requests to make to the Gemini model.", 28 | ] = 3 29 | disable_tqdm: Annotated[ 30 | bool, 31 | "Whether to disable the tqdm progress bar.", 32 | ] = False 33 | 34 | def __init__(self, llm_service: BaseService, config=None): 35 | self.llm_service = llm_service 36 | assign_config(self, config) 37 | 38 | def extract_image( 39 | self, 40 | document: Document, 41 | page: PageGroup, 42 | remove_blocks: Sequence[BlockTypes] | None = None, 43 | highres: bool = False, # Default False to save tokens 44 | ) -> Image.Image: 45 | return page.get_image( 46 | document, 47 | highres=highres, 48 | remove_blocks=remove_blocks, 49 | ) 50 | 51 | def __call__( 52 | self, document: Document, *args, **kwargs 53 | ) -> Optional[ExtractionResult]: 54 | raise NotImplementedError 55 | -------------------------------------------------------------------------------- /marker/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import warnings 3 | 4 | from marker.settings import settings 5 | 6 | 7 | def configure_logging(): 8 | # Setup marker logger 9 | logger = get_logger() 10 | 11 | if not logger.handlers: 12 | handler = logging.StreamHandler() 13 | formatter = logging.Formatter( 14 | "%(asctime)s [%(levelname)s] %(name)s: %(message)s" 15 | ) 16 | handler.setFormatter(formatter) 17 | logger.addHandler(handler) 18 | 19 | logger.setLevel(settings.LOGLEVEL) 20 | 21 | # Ignore future warnings 22 | warnings.simplefilter(action="ignore", category=FutureWarning) 23 | 24 | # Set component loglevels 25 | logging.getLogger("PIL").setLevel(logging.ERROR) 26 | logging.getLogger("fontTools.subset").setLevel(logging.ERROR) 27 | logging.getLogger("fontTools.ttLib.ttFont").setLevel(logging.ERROR) 28 | logging.getLogger("weasyprint").setLevel(logging.CRITICAL) 29 | 30 | 31 | def get_logger(): 32 | return logging.getLogger("marker") 33 | -------------------------------------------------------------------------------- /marker/models.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for an op, which is not supported on MPS 3 | 4 | from surya.detection import DetectionPredictor 5 | from surya.layout import LayoutPredictor 6 | from surya.ocr_error import OCRErrorPredictor 7 | from surya.recognition import RecognitionPredictor 8 | from surya.table_rec import TableRecPredictor 9 | 10 | def create_model_dict(device=None, dtype=None) -> dict: 11 | return { 12 | "layout_model": LayoutPredictor(device=device, dtype=dtype), 13 | "recognition_model": RecognitionPredictor(device=device, dtype=dtype), 14 | "table_rec_model": TableRecPredictor(device=device, dtype=dtype), 15 | "detection_model": DetectionPredictor(device=device, dtype=dtype), 16 | "ocr_error_model": OCRErrorPredictor(device=device, dtype=dtype) 17 | } -------------------------------------------------------------------------------- /marker/output.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from bs4 import BeautifulSoup, Tag 5 | from pydantic import BaseModel 6 | from PIL import Image 7 | 8 | from marker.renderers.extraction import ExtractionOutput 9 | from marker.renderers.html import HTMLOutput 10 | from marker.renderers.json import JSONOutput, JSONBlockOutput 11 | from marker.renderers.markdown import MarkdownOutput 12 | 
from marker.renderers.ocr_json import OCRJSONOutput 13 | from marker.schema.blocks import BlockOutput 14 | from marker.settings import settings 15 | 16 | 17 | def unwrap_outer_tag(html: str): 18 | soup = BeautifulSoup(html, "html.parser") 19 | contents = list(soup.contents) 20 | if len(contents) == 1 and isinstance(contents[0], Tag) and contents[0].name == "p": 21 | # Unwrap the p tag 22 | soup.p.unwrap() 23 | 24 | return str(soup) 25 | 26 | 27 | def json_to_html(block: JSONBlockOutput | BlockOutput): 28 | # Utility function to take in json block output and give html for the block. 29 | if not getattr(block, "children", None): 30 | return block.html 31 | else: 32 | child_html = [json_to_html(child) for child in block.children] 33 | child_ids = [child.id for child in block.children] 34 | 35 | soup = BeautifulSoup(block.html, "html.parser") 36 | content_refs = soup.find_all("content-ref") 37 | for ref in content_refs: 38 | src_id = ref.attrs["src"] 39 | if src_id in child_ids: 40 | child_soup = BeautifulSoup( 41 | child_html[child_ids.index(src_id)], "html.parser" 42 | ) 43 | ref.replace_with(child_soup) 44 | return str(soup) 45 | 46 | 47 | def output_exists(output_dir: str, fname_base: str): 48 | exts = ["md", "html", "json"] 49 | for ext in exts: 50 | if os.path.exists(os.path.join(output_dir, f"{fname_base}.{ext}")): 51 | return True 52 | return False 53 | 54 | 55 | def text_from_rendered(rendered: BaseModel): 56 | if isinstance(rendered, MarkdownOutput): 57 | return rendered.markdown, "md", rendered.images 58 | elif isinstance(rendered, HTMLOutput): 59 | return rendered.html, "html", rendered.images 60 | elif isinstance(rendered, JSONOutput): 61 | return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {} 62 | elif isinstance(rendered, OCRJSONOutput): 63 | return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {} 64 | elif isinstance(rendered, ExtractionOutput): 65 | return rendered.document_json, "json", {} 66 | else: 67 | raise ValueError("Invalid output type") 68 | 69 | 70 | def convert_if_not_rgb(image: Image.Image) -> Image.Image: 71 | if image.mode != "RGB": 72 | image = image.convert("RGB") 73 | return image 74 | 75 | 76 | def save_output(rendered: BaseModel, output_dir: str, fname_base: str): 77 | text, ext, images = text_from_rendered(rendered) 78 | text = text.encode(settings.OUTPUT_ENCODING, errors="replace").decode( 79 | settings.OUTPUT_ENCODING 80 | ) 81 | 82 | with open( 83 | os.path.join(output_dir, f"{fname_base}.{ext}"), 84 | "w+", 85 | encoding=settings.OUTPUT_ENCODING, 86 | ) as f: 87 | f.write(text) 88 | with open( 89 | os.path.join(output_dir, f"{fname_base}_meta.json"), 90 | "w+", 91 | encoding=settings.OUTPUT_ENCODING, 92 | ) as f: 93 | f.write(json.dumps(rendered.metadata, indent=2)) 94 | 95 | for img_name, img in images.items(): 96 | img = convert_if_not_rgb(img) # RGBA images can't save as JPG 97 | img.save(os.path.join(output_dir, img_name), settings.OUTPUT_IMAGE_FORMAT) 98 | -------------------------------------------------------------------------------- /marker/processors/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | 3 | from pydantic import BaseModel 4 | 5 | from marker.schema import BlockTypes 6 | from marker.schema.document import Document 7 | from marker.util import assign_config 8 | 9 | 10 | class BaseProcessor: 11 | block_types: Tuple[BlockTypes] | None = None # What block types this processor is responsible for 12 | 13 | def 
__init__(self, config: Optional[BaseModel | dict] = None): 14 | assign_config(self, config) 15 | 16 | def __call__(self, document: Document, *args, **kwargs): 17 | raise NotImplementedError 18 | -------------------------------------------------------------------------------- /marker/processors/blockquote.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated, Tuple 2 | 3 | from marker.processors import BaseProcessor 4 | from marker.schema import BlockTypes 5 | from marker.schema.document import Document 6 | 7 | 8 | class BlockquoteProcessor(BaseProcessor): 9 | """ 10 | A processor for tagging blockquotes. 11 | """ 12 | block_types: Annotated[ 13 | Tuple[BlockTypes], 14 | "The block types to process.", 15 | ] = (BlockTypes.Text, BlockTypes.TextInlineMath) 16 | min_x_indent: Annotated[ 17 | float, 18 | "The minimum horizontal indentation required to consider a block as part of a blockquote.", 19 | "Expressed as a percentage of the block width.", 20 | ] = 0.1 21 | x_start_tolerance: Annotated[ 22 | float, 23 | "The maximum allowable difference between the starting x-coordinates of consecutive blocks to consider them aligned.", 24 | "Expressed as a percentage of the block width.", 25 | ] = 0.01 26 | x_end_tolerance: Annotated[ 27 | float, 28 | "The maximum allowable difference between the ending x-coordinates of consecutive blocks to consider them aligned.", 29 | "Expressed as a percentage of the block width.", 30 | ] = 0.01 31 | 32 | def __init__(self, config): 33 | super().__init__(config) 34 | 35 | def __call__(self, document: Document): 36 | for page in document.pages: 37 | for block in page.contained_blocks(document, self.block_types): 38 | if block.structure is None: 39 | continue 40 | 41 | if not len(block.structure) >= 2: 42 | continue 43 | 44 | next_block = page.get_next_block(block) 45 | if next_block is None: 46 | continue 47 | if next_block.block_type not in self.block_types: 48 | continue 49 | if next_block.structure is None: 50 | continue 51 | if next_block.ignore_for_output: 52 | continue 53 | 54 | matching_x_end = abs(next_block.polygon.x_end - block.polygon.x_end) < self.x_end_tolerance * block.polygon.width 55 | matching_x_start = abs(next_block.polygon.x_start - block.polygon.x_start) < self.x_start_tolerance * block.polygon.width 56 | x_indent = next_block.polygon.x_start > block.polygon.x_start + (self.min_x_indent * block.polygon.width) 57 | y_indent = next_block.polygon.y_start > block.polygon.y_end 58 | 59 | if block.blockquote: 60 | next_block.blockquote = (matching_x_end and matching_x_start) or (x_indent and y_indent) 61 | next_block.blockquote_level = block.blockquote_level 62 | if (x_indent and y_indent): 63 | next_block.blockquote_level += 1 64 | elif len(next_block.structure) >= 2 and (x_indent and y_indent): 65 | next_block.blockquote = True 66 | next_block.blockquote_level = 1 -------------------------------------------------------------------------------- /marker/processors/code.py: -------------------------------------------------------------------------------- 1 | from marker.processors import BaseProcessor 2 | from marker.schema import BlockTypes 3 | from marker.schema.blocks import Code 4 | from marker.schema.document import Document 5 | 6 | 7 | class CodeProcessor(BaseProcessor): 8 | """ 9 | A processor for formatting code blocks. 
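    Indentation is reconstructed from line geometry rather than from the text
    itself: an average character width is estimated over the block's lines,
    and each line that starts a new row is prefixed with roughly

        int((line_x_start - min_left) / avg_char_width)

    spaces, where min_left is the leftmost x-coordinate seen in the block
    (see format_block below).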
10 | """ 11 | block_types = (BlockTypes.Code, ) 12 | 13 | def __call__(self, document: Document): 14 | for page in document.pages: 15 | for block in page.contained_blocks(document, self.block_types): 16 | self.format_block(document, block) 17 | 18 | 19 | def format_block(self, document: Document, block: Code): 20 | min_left = 9999 # will contain x- coord of column 0 21 | total_width = 0 22 | total_chars = 0 23 | 24 | contained_lines = block.contained_blocks(document, (BlockTypes.Line,)) 25 | for line in contained_lines: 26 | min_left = min(line.polygon.bbox[0], min_left) 27 | total_width += line.polygon.width 28 | total_chars += len(line.raw_text(document)) 29 | 30 | avg_char_width = total_width / max(total_chars, 1) 31 | code_text = "" 32 | is_new_line = False 33 | for line in contained_lines: 34 | text = line.raw_text(document) 35 | if avg_char_width == 0: 36 | prefix = "" 37 | else: 38 | total_spaces = int((line.polygon.bbox[0] - min_left) / avg_char_width) 39 | prefix = " " * max(0, total_spaces) 40 | 41 | if is_new_line: 42 | text = prefix + text 43 | 44 | code_text += text 45 | is_new_line = text.endswith("\n") 46 | 47 | block.code = code_text.rstrip() 48 | -------------------------------------------------------------------------------- /marker/processors/document_toc.py: -------------------------------------------------------------------------------- 1 | from marker.processors import BaseProcessor 2 | from marker.schema import BlockTypes 3 | from marker.schema.document import Document 4 | 5 | 6 | class DocumentTOCProcessor(BaseProcessor): 7 | """ 8 | A processor for generating a table of contents for the document. 9 | """ 10 | block_types = (BlockTypes.SectionHeader, ) 11 | 12 | def __call__(self, document: Document): 13 | toc = [] 14 | for page in document.pages: 15 | for block in page.contained_blocks(document, self.block_types): 16 | toc.append({ 17 | "title": block.raw_text(document).strip(), 18 | "heading_level": block.heading_level, 19 | "page_id": page.page_id, 20 | "polygon": block.polygon.polygon 21 | }) 22 | document.table_of_contents = toc 23 | -------------------------------------------------------------------------------- /marker/processors/footnote.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from marker.processors import BaseProcessor 4 | from marker.schema import BlockTypes 5 | from marker.schema.document import Document 6 | from marker.schema.groups import PageGroup 7 | 8 | 9 | class FootnoteProcessor(BaseProcessor): 10 | """ 11 | A processor for pushing footnotes to the bottom, and relabeling mislabeled text blocks. 
12 | """ 13 | block_types = (BlockTypes.Footnote,) 14 | 15 | def __call__(self, document: Document): 16 | for page in document.pages: 17 | self.push_footnotes_to_bottom(page, document) 18 | self.assign_superscripts(page, document) 19 | 20 | def push_footnotes_to_bottom(self, page: PageGroup, document: Document): 21 | footnote_blocks = page.contained_blocks(document, self.block_types) 22 | 23 | # Push footnotes to the bottom 24 | for block in footnote_blocks: 25 | # Check if it is top-level 26 | if block.id in page.structure: 27 | # Move to bottom if it is 28 | page.structure.remove(block.id) 29 | page.add_structure(block) 30 | 31 | def assign_superscripts(self, page: PageGroup, document: Document): 32 | footnote_blocks = page.contained_blocks(document, self.block_types) 33 | 34 | for block in footnote_blocks: 35 | for span in block.contained_blocks(document, (BlockTypes.Span,)): 36 | if re.match(r"^[0-9\W]+", span.text): 37 | span.has_superscript = True 38 | break 39 | -------------------------------------------------------------------------------- /marker/processors/llm/llm_image_description.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor, BlockData 4 | 5 | from marker.schema import BlockTypes 6 | from marker.schema.document import Document 7 | 8 | from typing import Annotated, List 9 | 10 | 11 | class LLMImageDescriptionProcessor(BaseLLMSimpleBlockProcessor): 12 | block_types = (BlockTypes.Picture, BlockTypes.Figure,) 13 | extract_images: Annotated[ 14 | bool, 15 | "Extract images from the document." 16 | ] = True 17 | image_description_prompt: Annotated[ 18 | str, 19 | "The prompt to use for generating image descriptions.", 20 | "Default is a string containing the Gemini prompt." 21 | ] = """You are a document analysis expert who specializes in creating text descriptions for images. 22 | You will receive an image of a picture or figure. Your job will be to create a short description of the image. 23 | **Instructions:** 24 | 1. Carefully examine the provided image. 25 | 2. Analyze any text that was extracted from within the image. 26 | 3. Output a faithful description of the image. Make sure there is enough specific detail to accurately reconstruct the image. If the image is a figure or contains numeric data, include the numeric data in the output. 27 | **Example:** 28 | Input: 29 | ```text 30 | "Fruit Preference Survey" 31 | 20, 15, 10 32 | Apples, Bananas, Oranges 33 | ``` 34 | Output: 35 | In this figure, a bar chart titled "Fruit Preference Survey" is showing the number of people who prefer different types of fruits. The x-axis shows the types of fruits, and the y-axis shows the number of people. The bar chart shows that most people prefer apples, followed by bananas and oranges. 20 people prefer apples, 15 people prefer bananas, and 10 people prefer oranges. 
36 | **Input:** 37 | ```text 38 | {raw_text} 39 | ``` 40 | """ 41 | 42 | def inference_blocks(self, document: Document) -> List[BlockData]: 43 | blocks = super().inference_blocks(document) 44 | if self.extract_images: 45 | return [] 46 | return blocks 47 | 48 | def block_prompts(self, document: Document) -> List[PromptData]: 49 | prompt_data = [] 50 | for block_data in self.inference_blocks(document): 51 | block = block_data["block"] 52 | prompt = self.image_description_prompt.replace("{raw_text}", block.raw_text(document)) 53 | image = self.extract_image(document, block) 54 | 55 | prompt_data.append({ 56 | "prompt": prompt, 57 | "image": image, 58 | "block": block, 59 | "schema": ImageSchema, 60 | "page": block_data["page"] 61 | }) 62 | 63 | return prompt_data 64 | 65 | def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document): 66 | block = prompt_data["block"] 67 | 68 | if not response or "image_description" not in response: 69 | block.update_metadata(llm_error_count=1) 70 | return 71 | 72 | image_description = response["image_description"] 73 | if len(image_description) < 10: 74 | block.update_metadata(llm_error_count=1) 75 | return 76 | 77 | block.description = image_description 78 | 79 | class ImageSchema(BaseModel): 80 | image_description: str 81 | -------------------------------------------------------------------------------- /marker/processors/llm/llm_meta.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor 2 | from typing import List, Dict, Any 3 | 4 | from marker.logger import get_logger 5 | from tqdm import tqdm 6 | 7 | from marker.processors.llm import BaseLLMSimpleBlockProcessor, BaseLLMProcessor 8 | from marker.schema.document import Document 9 | from marker.services import BaseService 10 | 11 | logger = get_logger() 12 | 13 | 14 | class LLMSimpleBlockMetaProcessor(BaseLLMProcessor): 15 | """ 16 | A wrapper for simple LLM processors, so they can all run in parallel. 
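    Prompts from all wrapped processors are gathered up front and submitted to
    a single shared ThreadPoolExecutor (bounded by max_concurrency); each
    response is then routed back to the processor that generated the prompt.
    It no-ops unless use_llm is set and an LLM service is configured.
    Illustrative setup (the processor and service instances are assumed to
    exist already):

        meta = LLMSimpleBlockMetaProcessor(
            processor_lst=[image_desc_processor, equation_processor],
            llm_service=llm_service,
            config={"max_concurrency": 3},
        )
        meta(document)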
17 | """ 18 | 19 | def __init__( 20 | self, 21 | processor_lst: List[BaseLLMSimpleBlockProcessor], 22 | llm_service: BaseService, 23 | config=None, 24 | ): 25 | super().__init__(llm_service, config) 26 | self.processors = processor_lst 27 | 28 | def __call__(self, document: Document): 29 | if not self.use_llm or self.llm_service is None: 30 | return 31 | 32 | total = sum( 33 | [len(processor.inference_blocks(document)) for processor in self.processors] 34 | ) 35 | pbar = tqdm( 36 | desc="LLM processors running", disable=self.disable_tqdm, total=total 37 | ) 38 | 39 | all_prompts = [ 40 | processor.block_prompts(document) for processor in self.processors 41 | ] 42 | pending = [] 43 | futures_map = {} 44 | with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor: 45 | for i, prompt_lst in enumerate(all_prompts): 46 | for prompt in prompt_lst: 47 | future = executor.submit(self.get_response, prompt) 48 | pending.append(future) 49 | futures_map[future] = {"processor_idx": i, "prompt_data": prompt} 50 | 51 | for future in pending: 52 | try: 53 | result = future.result() 54 | future_data = futures_map.pop(future) 55 | processor: BaseLLMSimpleBlockProcessor = self.processors[ 56 | future_data["processor_idx"] 57 | ] 58 | # finalize the result 59 | processor(result, future_data["prompt_data"], document) 60 | except Exception as e: 61 | logger.warning(f"Error processing LLM response: {e}") 62 | 63 | pbar.update(1) 64 | 65 | pbar.close() 66 | 67 | def get_response(self, prompt_data: Dict[str, Any]): 68 | return self.llm_service( 69 | prompt_data["prompt"], 70 | prompt_data["image"], 71 | prompt_data["block"], 72 | prompt_data["schema"], 73 | ) 74 | -------------------------------------------------------------------------------- /marker/processors/order.py: -------------------------------------------------------------------------------- 1 | from statistics import mean 2 | from collections import defaultdict 3 | 4 | from marker.processors import BaseProcessor 5 | from marker.schema import BlockTypes 6 | from marker.schema.document import Document 7 | 8 | 9 | class OrderProcessor(BaseProcessor): 10 | """ 11 | A processor for sorting the blocks in order if needed. This can help when the layout image was sliced. 
12 | """ 13 | block_types = tuple() 14 | 15 | def __call__(self, document: Document): 16 | for page in document.pages: 17 | # Skip OCRed pages 18 | if page.text_extraction_method != "pdftext": 19 | continue 20 | 21 | # Skip pages without layout slicing 22 | if not page.layout_sliced: 23 | continue 24 | 25 | block_idxs = defaultdict(int) 26 | for block_id in page.structure: 27 | block = document.get_block(block_id) 28 | spans = block.contained_blocks(document, (BlockTypes.Span, )) 29 | if len(spans) == 0: 30 | continue 31 | 32 | # Avg span position in original PDF 33 | block_idxs[block_id] = (spans[0].minimum_position + spans[-1].maximum_position) / 2 34 | 35 | for block_id in page.structure: 36 | # Already assigned block id via span position 37 | if block_idxs[block_id] > 0: 38 | continue 39 | 40 | block = document.get_block(block_id) 41 | prev_block = document.get_prev_block(block) 42 | next_block = document.get_next_block(block) 43 | 44 | block_idx_add = 0 45 | if prev_block: 46 | block_idx_add = 1 47 | 48 | while prev_block and prev_block.id not in block_idxs: 49 | prev_block = document.get_prev_block(prev_block) 50 | block_idx_add += 1 51 | 52 | if not prev_block: 53 | block_idx_add = -1 54 | while next_block and next_block.id not in block_idxs: 55 | next_block = document.get_next_block(next_block) 56 | block_idx_add -= 1 57 | 58 | if not next_block and not prev_block: 59 | pass 60 | elif prev_block: 61 | block_idxs[block_id] = block_idxs[prev_block.id] + block_idx_add 62 | else: 63 | block_idxs[block_id] = block_idxs[next_block.id] + block_idx_add 64 | 65 | page.structure = sorted(page.structure, key=lambda x: block_idxs[x]) 66 | 67 | -------------------------------------------------------------------------------- /marker/processors/page_header.py: -------------------------------------------------------------------------------- 1 | from marker.processors import BaseProcessor 2 | from marker.schema import BlockTypes 3 | from marker.schema.document import Document 4 | from marker.schema.groups.page import PageGroup 5 | 6 | 7 | class PageHeaderProcessor(BaseProcessor): 8 | """ 9 | A processor for moving PageHeaders to the top 10 | """ 11 | block_types = (BlockTypes.PageHeader,) 12 | 13 | def __call__(self, document: Document): 14 | for page in document.pages: 15 | self.move_page_header_to_top(page, document) 16 | 17 | def move_page_header_to_top(self, page: PageGroup, document: Document): 18 | page_header_blocks = page.contained_blocks(document, self.block_types) 19 | page_header_block_ids = [block.id for block in page_header_blocks] 20 | for block_id in page_header_block_ids: 21 | page.structure.remove(block_id) 22 | page.structure[:0] = page_header_block_ids 23 | 24 | -------------------------------------------------------------------------------- /marker/processors/reference.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from marker.processors import BaseProcessor 4 | from marker.schema import BlockTypes 5 | from marker.schema.blocks import Reference 6 | from marker.schema.document import Document 7 | from marker.schema.groups.list import ListGroup 8 | from marker.schema.groups.table import TableGroup 9 | from marker.schema.registry import get_block_class 10 | from marker.schema.groups.figure import FigureGroup 11 | 12 | 13 | class ReferenceProcessor(BaseProcessor): 14 | """ 15 | A processor for adding references to the document. 
16 | """ 17 | 18 | def __init__(self, config): 19 | super().__init__(config) 20 | 21 | def __call__(self, document: Document): 22 | ReferenceClass: Reference = get_block_class(BlockTypes.Reference) 23 | 24 | for page in document.pages: 25 | refs = page.refs 26 | ref_starts = np.array([ref.coord for ref in refs]) 27 | 28 | blocks = [] 29 | for block_id in page.structure: 30 | block = page.get_block(block_id) 31 | if isinstance(block, (ListGroup, FigureGroup, TableGroup)): 32 | blocks.extend([page.get_block(b) for b in block.structure]) 33 | else: 34 | blocks.append(block) 35 | blocks = [b for b in blocks if not b.ignore_for_output] 36 | 37 | block_starts = np.array([block.polygon.bbox[:2] for block in blocks]) 38 | 39 | if not (len(refs) and len(block_starts)): 40 | continue 41 | 42 | distances = np.linalg.norm(block_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :], axis=2) 43 | for ref_idx in range(len(ref_starts)): 44 | block_idx = np.argmin(distances[:, ref_idx]) 45 | block = blocks[block_idx] 46 | 47 | ref_block = page.add_full_block(ReferenceClass( 48 | ref=refs[ref_idx].ref, 49 | polygon=block.polygon, 50 | page_id=page.page_id 51 | )) 52 | if block.structure is None: 53 | block.structure = [] 54 | block.structure.insert(0, ref_block.id) 55 | -------------------------------------------------------------------------------- /marker/processors/util.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from marker.schema import BlockTypes 6 | from marker.schema.groups import PageGroup 7 | from marker.schema.registry import get_block_class 8 | from marker.schema.text import Line 9 | 10 | 11 | def escape_latex_commands(text: str): 12 | text = (text 13 | .replace('\n', '\\n') 14 | .replace('\t', '\\t') 15 | .replace('\r', '\\r')) 16 | return text 17 | 18 | 19 | def add_math_spans_to_line(corrected_text: str, text_line: Line, page: PageGroup): 20 | SpanClass = get_block_class(BlockTypes.Span) 21 | corrected_spans = text_to_spans(corrected_text) 22 | 23 | for span_idx, span in enumerate(corrected_spans): 24 | if span_idx == len(corrected_spans) - 1: 25 | span['content'] += "\n" 26 | 27 | span_block = page.add_full_block( 28 | SpanClass( 29 | polygon=text_line.polygon, 30 | text=span['content'], 31 | font='Unknown', 32 | font_weight=0, 33 | font_size=0, 34 | minimum_position=0, 35 | maximum_position=0, 36 | formats=[span['type']], 37 | url=span.get('url'), 38 | page_id=text_line.page_id, 39 | text_extraction_method="gemini", 40 | has_superscript=span["has_superscript"], 41 | has_subscript=span["has_subscript"] 42 | ) 43 | ) 44 | text_line.structure.append(span_block.id) 45 | 46 | 47 | def text_to_spans(text): 48 | soup = BeautifulSoup(text, 'html.parser') 49 | 50 | tag_types = { 51 | 'b': 'bold', 52 | 'i': 'italic', 53 | 'math': 'math', 54 | 'sub': 'plain', 55 | 'sup': 'plain', 56 | 'span': 'plain' 57 | } 58 | spans = [] 59 | 60 | for element in soup.descendants: 61 | if not len(list(element.parents)) == 1: 62 | continue 63 | 64 | url = element.attrs.get('href') if hasattr(element, 'attrs') else None 65 | 66 | if element.name in tag_types: 67 | text = element.get_text() 68 | if element.name == "math": 69 | text = escape_latex_commands(text) 70 | spans.append({ 71 | 'type': tag_types[element.name], 72 | 'content': text, 73 | 'url': url, 74 | "has_superscript": element.name == "sup", 75 | "has_subscript": element.name == "sub" 76 | }) 77 | elif element.string: 78 | spans.append({ 79 | 'type': 
'plain', 80 | 'content': element.string, 81 | 'url': url, 82 | "has_superscript": False, 83 | "has_subscript": False 84 | }) 85 | 86 | return spans -------------------------------------------------------------------------------- /marker/providers/__init__.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import List, Optional, Dict 3 | 4 | from PIL import Image 5 | from pydantic import BaseModel 6 | 7 | from pdftext.schema import Reference 8 | 9 | from marker.logger import configure_logging 10 | from marker.schema.polygon import PolygonBox 11 | from marker.schema.text import Span 12 | from marker.schema.text.char import Char 13 | from marker.schema.text.line import Line 14 | from marker.settings import settings 15 | from marker.util import assign_config 16 | 17 | configure_logging() 18 | 19 | 20 | class ProviderOutput(BaseModel): 21 | line: Line 22 | spans: List[Span] 23 | chars: Optional[List[List[Char]]] = None 24 | 25 | @property 26 | def raw_text(self): 27 | return "".join(span.text for span in self.spans) 28 | 29 | def __hash__(self): 30 | return hash(tuple(self.line.polygon.bbox)) 31 | 32 | def merge(self, other: "ProviderOutput"): 33 | new_output = deepcopy(self) 34 | other_copy = deepcopy(other) 35 | 36 | new_output.spans.extend(other_copy.spans) 37 | if new_output.chars is not None and other_copy.chars is not None: 38 | new_output.chars.extend(other_copy.chars) 39 | elif other_copy.chars is not None: 40 | new_output.chars = other_copy.chars 41 | 42 | new_output.line.polygon = new_output.line.polygon.merge( 43 | [other_copy.line.polygon] 44 | ) 45 | return new_output 46 | 47 | 48 | ProviderPageLines = Dict[int, List[ProviderOutput]] 49 | 50 | 51 | class BaseProvider: 52 | def __init__(self, filepath: str, config: Optional[BaseModel | dict] = None): 53 | assign_config(self, config) 54 | self.filepath = filepath 55 | 56 | def __len__(self): 57 | pass 58 | 59 | def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]: 60 | pass 61 | 62 | def get_page_bbox(self, idx: int) -> PolygonBox | None: 63 | pass 64 | 65 | def get_page_lines(self, idx: int) -> List[Line]: 66 | pass 67 | 68 | def get_page_refs(self, idx: int) -> List[Reference]: 69 | pass 70 | 71 | def __enter__(self): 72 | return self 73 | 74 | @staticmethod 75 | def get_font_css(): 76 | from weasyprint import CSS 77 | from weasyprint.text.fonts import FontConfiguration 78 | 79 | font_config = FontConfiguration() 80 | css = CSS( 81 | string=f""" 82 | @font-face {{ 83 | font-family: GoNotoCurrent-Regular; 84 | src: url({settings.FONT_PATH}); 85 | font-display: swap; 86 | }} 87 | body {{ 88 | font-family: {settings.FONT_NAME.split(".")[0]}, sans-serif; 89 | font-variant-ligatures: none; 90 | font-feature-settings: "liga" 0; 91 | text-rendering: optimizeLegibility; 92 | }} 93 | """, 94 | font_config=font_config, 95 | ) 96 | return css 97 | -------------------------------------------------------------------------------- /marker/providers/document.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import os 3 | import re 4 | import tempfile 5 | from io import BytesIO 6 | 7 | from PIL import Image 8 | from marker.logger import get_logger 9 | 10 | from marker.providers.pdf import PdfProvider 11 | 12 | logger = get_logger() 13 | 14 | css = """ 15 | @page { 16 | size: A4; 17 | margin: 2cm; 18 | } 19 | 20 | img { 21 | max-width: 100%; 22 | max-height: 25cm; 23 | object-fit: contain; 24 | 
margin: 12pt auto; 25 | } 26 | 27 | div, p { 28 | max-width: 100%; 29 | word-break: break-word; 30 | font-size: 10pt; 31 | } 32 | 33 | table { 34 | width: 100%; 35 | border-collapse: collapse; 36 | break-inside: auto; 37 | font-size: 10pt; 38 | } 39 | 40 | tr { 41 | break-inside: avoid; 42 | page-break-inside: avoid; 43 | } 44 | 45 | td { 46 | border: 0.75pt solid #000; 47 | padding: 6pt; 48 | } 49 | """ 50 | 51 | 52 | class DocumentProvider(PdfProvider): 53 | def __init__(self, filepath: str, config=None): 54 | temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") 55 | self.temp_pdf_path = temp_pdf.name 56 | temp_pdf.close() 57 | 58 | # Convert DOCX to PDF 59 | try: 60 | self.convert_docx_to_pdf(filepath) 61 | except Exception as e: 62 | raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}") 63 | 64 | # Initialize the PDF provider with the temp pdf path 65 | super().__init__(self.temp_pdf_path, config) 66 | 67 | def __del__(self): 68 | if os.path.exists(self.temp_pdf_path): 69 | os.remove(self.temp_pdf_path) 70 | 71 | def convert_docx_to_pdf(self, filepath: str): 72 | from weasyprint import CSS, HTML 73 | import mammoth 74 | 75 | with open(filepath, "rb") as docx_file: 76 | # we convert the docx to HTML 77 | result = mammoth.convert_to_html(docx_file) 78 | html = result.value 79 | 80 | # We convert the HTML into a PDF 81 | HTML(string=self._preprocess_base64_images(html)).write_pdf( 82 | self.temp_pdf_path, stylesheets=[CSS(string=css), self.get_font_css()] 83 | ) 84 | 85 | @staticmethod 86 | def _preprocess_base64_images(html_content): 87 | pattern = r'data:([^;]+);base64,([^"\'>\s]+)' 88 | 89 | def convert_image(match): 90 | try: 91 | img_data = base64.b64decode(match.group(2)) 92 | 93 | with BytesIO(img_data) as bio: 94 | with Image.open(bio) as img: 95 | output = BytesIO() 96 | img.save(output, format=img.format) 97 | new_base64 = base64.b64encode(output.getvalue()).decode() 98 | return f"data:{match.group(1)};base64,{new_base64}" 99 | 100 | except Exception as e: 101 | logger.error(f"Failed to process image: {e}") 102 | return "" # we ditch broken images as that breaks the PDF creation down the line 103 | 104 | return re.sub(pattern, convert_image, html_content) 105 | -------------------------------------------------------------------------------- /marker/providers/epub.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import os 3 | import tempfile 4 | 5 | from bs4 import BeautifulSoup 6 | 7 | from marker.providers.pdf import PdfProvider 8 | 9 | css = ''' 10 | @page { 11 | size: A4; 12 | margin: 2cm; 13 | } 14 | 15 | img { 16 | max-width: 100%; 17 | max-height: 25cm; 18 | object-fit: contain; 19 | margin: 12pt auto; 20 | } 21 | 22 | div, p { 23 | max-width: 100%; 24 | word-break: break-word; 25 | font-size: 10pt; 26 | } 27 | 28 | table { 29 | width: 100%; 30 | border-collapse: collapse; 31 | break-inside: auto; 32 | font-size: 10pt; 33 | } 34 | 35 | tr { 36 | break-inside: avoid; 37 | page-break-inside: avoid; 38 | } 39 | 40 | td { 41 | border: 0.75pt solid #000; 42 | padding: 6pt; 43 | } 44 | ''' 45 | 46 | 47 | class EpubProvider(PdfProvider): 48 | def __init__(self, filepath: str, config=None): 49 | temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=f".pdf") 50 | self.temp_pdf_path = temp_pdf.name 51 | temp_pdf.close() 52 | 53 | # Convert Epub to PDF 54 | try: 55 | self.convert_epub_to_pdf(filepath) 56 | except Exception as e: 57 | raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}") 
58 | 59 | # Initialize the PDF provider with the temp pdf path 60 | super().__init__(self.temp_pdf_path, config) 61 | 62 | def __del__(self): 63 | if os.path.exists(self.temp_pdf_path): 64 | os.remove(self.temp_pdf_path) 65 | 66 | def convert_epub_to_pdf(self, filepath): 67 | from weasyprint import CSS, HTML 68 | from ebooklib import epub 69 | import ebooklib 70 | 71 | ebook = epub.read_epub(filepath) 72 | 73 | styles = [] 74 | html_content = "" 75 | img_tags = {} 76 | 77 | for item in ebook.get_items(): 78 | if item.get_type() == ebooklib.ITEM_IMAGE: 79 | img_data = base64.b64encode(item.get_content()).decode("utf-8") 80 | img_tags[item.file_name] = f'data:{item.media_type};base64,{img_data}' 81 | elif item.get_type() == ebooklib.ITEM_STYLE: 82 | styles.append(item.get_content().decode('utf-8')) 83 | 84 | for item in ebook.get_items(): 85 | if item.get_type() == ebooklib.ITEM_DOCUMENT: 86 | html_content += item.get_content().decode("utf-8") 87 | 88 | soup = BeautifulSoup(html_content, 'html.parser') 89 | for img in soup.find_all('img'): 90 | src = img.get('src') 91 | if src: 92 | normalized_src = src.replace('../', '') 93 | if normalized_src in img_tags: 94 | img['src'] = img_tags[normalized_src] 95 | 96 | for image in soup.find_all('image'): 97 | src = image.get('xlink:href') 98 | if src: 99 | normalized_src = src.replace('../', '') 100 | if normalized_src in img_tags: 101 | image['xlink:href'] = img_tags[normalized_src] 102 | 103 | html_content = str(soup) 104 | full_style = ''.join([css]) # + styles) 105 | 106 | # we convert the epub to HTML 107 | HTML(string=html_content, base_url=filepath).write_pdf( 108 | self.temp_pdf_path, 109 | stylesheets=[CSS(string=full_style), self.get_font_css()] 110 | ) 111 | -------------------------------------------------------------------------------- /marker/providers/html.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | from marker.providers.pdf import PdfProvider 5 | 6 | 7 | class HTMLProvider(PdfProvider): 8 | def __init__(self, filepath: str, config=None): 9 | temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") 10 | self.temp_pdf_path = temp_pdf.name 11 | temp_pdf.close() 12 | 13 | # Convert HTML to PDF 14 | try: 15 | self.convert_html_to_pdf(filepath) 16 | except Exception as e: 17 | raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}") 18 | 19 | # Initialize the PDF provider with the temp pdf path 20 | super().__init__(self.temp_pdf_path, config) 21 | 22 | def __del__(self): 23 | if os.path.exists(self.temp_pdf_path): 24 | os.remove(self.temp_pdf_path) 25 | 26 | def convert_html_to_pdf(self, filepath: str): 27 | from weasyprint import HTML 28 | 29 | font_css = self.get_font_css() 30 | HTML(filename=filepath, encoding="utf-8").write_pdf( 31 | self.temp_pdf_path, stylesheets=[font_css] 32 | ) 33 | -------------------------------------------------------------------------------- /marker/providers/image.py: -------------------------------------------------------------------------------- 1 | from typing import List, Annotated 2 | from PIL import Image 3 | 4 | from marker.providers import ProviderPageLines, BaseProvider 5 | from marker.schema.polygon import PolygonBox 6 | from marker.schema.text import Line 7 | from pdftext.schema import Reference 8 | 9 | 10 | class ImageProvider(BaseProvider): 11 | page_range: Annotated[ 12 | List[int], 13 | "The range of pages to process.", 14 | "Default is None, which will process all pages.", 15 | ] = None 16 | 17 
17 |     image_count: int = 1
18 | 
19 |     def __init__(self, filepath: str, config=None):
20 |         super().__init__(filepath, config)
21 | 
22 |         self.images = [Image.open(filepath)]
23 |         self.page_lines: ProviderPageLines = {i: [] for i in range(self.image_count)}
24 | 
25 |         if self.page_range is None:
26 |             self.page_range = range(self.image_count)
27 | 
28 |         assert max(self.page_range) < self.image_count and min(self.page_range) >= 0, (
29 |             f"Invalid page range, values must be between 0 and {self.image_count - 1}. Min of provided page range is {min(self.page_range)} and max is {max(self.page_range)}."
30 |         )
31 | 
32 |         self.page_bboxes = {
33 |             i: [0, 0, self.images[i].size[0], self.images[i].size[1]]
34 |             for i in self.page_range
35 |         }
36 | 
37 |     def __len__(self):
38 |         return self.image_count
39 | 
40 |     def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
41 |         return [self.images[i] for i in idxs]
42 | 
43 |     def get_page_bbox(self, idx: int) -> PolygonBox | None:
44 |         bbox = self.page_bboxes[idx]
45 |         if bbox:
46 |             return PolygonBox.from_bbox(bbox)
47 | 
48 |     def get_page_lines(self, idx: int) -> List[Line]:
49 |         return self.page_lines[idx]
50 | 
51 |     def get_page_refs(self, idx: int) -> List[Reference]:
52 |         return []
53 | 
--------------------------------------------------------------------------------
/marker/providers/registry.py:
--------------------------------------------------------------------------------
1 | import filetype
2 | from filetype.match import match as file_match
3 | from bs4 import BeautifulSoup
4 | from filetype.types import archive, document, IMAGE
5 | 
6 | from marker.providers.document import DocumentProvider
7 | from marker.providers.epub import EpubProvider
8 | from marker.providers.html import HTMLProvider
9 | from marker.providers.image import ImageProvider
10 | from marker.providers.pdf import PdfProvider
11 | from marker.providers.powerpoint import PowerPointProvider
12 | from marker.providers.spreadsheet import SpreadSheetProvider
13 | 
14 | DOCTYPE_MATCHERS = {
15 |     "image": IMAGE,
16 |     "pdf": [
17 |         archive.Pdf,
18 |     ],
19 |     "epub": [
20 |         archive.Epub,
21 |     ],
22 |     "doc": [document.Docx],
23 |     "xls": [document.Xlsx],
24 |     "ppt": [document.Pptx],
25 | }
26 | 
27 | 
28 | def load_matchers(doctype: str):
29 |     return [cls() for cls in DOCTYPE_MATCHERS[doctype]]
30 | 
31 | 
32 | def load_extensions(doctype: str):
33 |     return [cls.EXTENSION for cls in DOCTYPE_MATCHERS[doctype]]
34 | 
35 | 
36 | def provider_from_ext(filepath: str):
37 |     ext = filepath.rsplit(".", 1)[-1].strip()
38 |     if not ext:
39 |         return PdfProvider
40 | 
41 |     if ext in load_extensions("image"):
42 |         return ImageProvider
43 |     if ext in load_extensions("pdf"):
44 |         return PdfProvider
45 |     if ext in load_extensions("doc"):
46 |         return DocumentProvider
47 |     if ext in load_extensions("xls"):
48 |         return SpreadSheetProvider
49 |     if ext in load_extensions("ppt"):
50 |         return PowerPointProvider
51 |     if ext in load_extensions("epub"):
52 |         return EpubProvider
53 |     if ext in ["html"]:
54 |         return HTMLProvider
55 | 
56 |     return PdfProvider
57 | 
58 | 
59 | def provider_from_filepath(filepath: str):
60 |     if filetype.image_match(filepath) is not None:
61 |         return ImageProvider
62 |     if file_match(filepath, load_matchers("pdf")) is not None:
63 |         return PdfProvider
64 |     if file_match(filepath, load_matchers("epub")) is not None:
65 |         return EpubProvider
66 |     if file_match(filepath, load_matchers("doc")) is not None:
67 |         return DocumentProvider
68 |     if file_match(filepath, load_matchers("xls")) is not None:
69 |         return
SpreadSheetProvider 70 | if file_match(filepath, load_matchers("ppt")) is not None: 71 | return PowerPointProvider 72 | 73 | try: 74 | with open(filepath, "r", encoding="utf-8") as f: 75 | soup = BeautifulSoup(f.read(), "html.parser") 76 | # Check if there are any HTML tags 77 | if bool(soup.find()): 78 | return HTMLProvider 79 | except Exception: 80 | pass 81 | 82 | # Fallback if we incorrectly detect the file type 83 | return provider_from_ext(filepath) 84 | -------------------------------------------------------------------------------- /marker/providers/utils.py: -------------------------------------------------------------------------------- 1 | def alphanum_ratio(text): 2 | text = text.replace(" ", "") 3 | text = text.replace("\n", "") 4 | alphanumeric_count = sum([1 for c in text if c.isalnum()]) 5 | 6 | if len(text) == 0: 7 | return 1 8 | 9 | ratio = alphanumeric_count / len(text) 10 | return ratio 11 | -------------------------------------------------------------------------------- /marker/renderers/extraction.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Dict 3 | 4 | from pydantic import BaseModel 5 | 6 | from marker.extractors import ExtractionResult 7 | from marker.renderers import BaseRenderer 8 | 9 | 10 | @dataclass 11 | class MergeData: 12 | confidence_exists_1: float 13 | confidence_exists_2: float 14 | confidence_value_1: float 15 | confidence_value_2: float 16 | 17 | 18 | def merge_keys( 19 | json: dict | list, json2: dict, merge_data: MergeData, confidence_threshold: int = 3 20 | ): 21 | if isinstance(json, list): 22 | json.extend(json2) 23 | 24 | elif isinstance(json, dict): 25 | for key in json: 26 | if isinstance(json[key], dict): 27 | merge_keys(json[key], json2[key], merge_data) 28 | elif isinstance(json[key], list): 29 | json[key] = json[key] + json2[key] 30 | else: 31 | value_2_correct = ( 32 | merge_data.confidence_exists_2 > confidence_threshold 33 | and merge_data.confidence_value_2 > confidence_threshold 34 | ) 35 | 36 | if value_2_correct and json2[key]: 37 | json[key] = json2[key] 38 | 39 | if not json[key] and json2[key]: 40 | json[key] = json2[key] 41 | 42 | 43 | class ExtractionOutput(BaseModel): 44 | pages: Dict[int, ExtractionResult] 45 | document_json: dict 46 | 47 | 48 | class ExtractionRenderer(BaseRenderer): 49 | def __call__(self, outputs: Dict[int, ExtractionResult]) -> ExtractionOutput: 50 | pnums = sorted(list(outputs.keys())) 51 | merged_result = outputs[pnums[0]].extracted_data.copy() 52 | confidence_exists = outputs[pnums[0]].existence_confidence 53 | confidence_value = outputs[pnums[0]].value_confidence 54 | 55 | for pnum in pnums[1:]: 56 | merge_data = MergeData( 57 | confidence_exists_1=confidence_exists, 58 | confidence_exists_2=outputs[pnum].existence_confidence, 59 | confidence_value_1=confidence_value, 60 | confidence_value_2=outputs[pnum].value_confidence, 61 | ) 62 | merge_keys(merged_result, outputs[pnum].extracted_data, merge_data) 63 | 64 | return ExtractionOutput(pages=outputs, document_json=merged_result) 65 | -------------------------------------------------------------------------------- /marker/renderers/json.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated, Dict, List, Tuple 2 | 3 | from pydantic import BaseModel 4 | 5 | from marker.renderers import BaseRenderer 6 | from marker.schema import BlockTypes 7 | from marker.schema.blocks import Block, BlockOutput 8 
| from marker.schema.document import Document 9 | from marker.schema.registry import get_block_class 10 | 11 | 12 | class JSONBlockOutput(BaseModel): 13 | id: str 14 | block_type: str 15 | html: str 16 | polygon: List[List[float]] 17 | bbox: List[float] 18 | children: List['JSONBlockOutput'] | None = None 19 | section_hierarchy: Dict[int, str] | None = None 20 | images: dict | None = None 21 | 22 | 23 | class JSONOutput(BaseModel): 24 | children: List[JSONBlockOutput] 25 | block_type: str = str(BlockTypes.Document) 26 | metadata: dict 27 | 28 | 29 | def reformat_section_hierarchy(section_hierarchy): 30 | new_section_hierarchy = {} 31 | for key, value in section_hierarchy.items(): 32 | new_section_hierarchy[key] = str(value) 33 | return new_section_hierarchy 34 | 35 | 36 | class JSONRenderer(BaseRenderer): 37 | """ 38 | A renderer for JSON output. 39 | """ 40 | image_blocks: Annotated[ 41 | Tuple[BlockTypes], 42 | "The list of block types to consider as images.", 43 | ] = (BlockTypes.Picture, BlockTypes.Figure) 44 | page_blocks: Annotated[ 45 | Tuple[BlockTypes], 46 | "The list of block types to consider as pages.", 47 | ] = (BlockTypes.Page,) 48 | 49 | def extract_json(self, document: Document, block_output: BlockOutput): 50 | cls = get_block_class(block_output.id.block_type) 51 | if cls.__base__ == Block: 52 | html, images = self.extract_block_html(document, block_output) 53 | return JSONBlockOutput( 54 | html=html, 55 | polygon=block_output.polygon.polygon, 56 | bbox=block_output.polygon.bbox, 57 | id=str(block_output.id), 58 | block_type=str(block_output.id.block_type), 59 | images=images, 60 | section_hierarchy=reformat_section_hierarchy(block_output.section_hierarchy) 61 | ) 62 | else: 63 | children = [] 64 | for child in block_output.children: 65 | child_output = self.extract_json(document, child) 66 | children.append(child_output) 67 | 68 | return JSONBlockOutput( 69 | html=block_output.html, 70 | polygon=block_output.polygon.polygon, 71 | bbox=block_output.polygon.bbox, 72 | id=str(block_output.id), 73 | block_type=str(block_output.id.block_type), 74 | children=children, 75 | section_hierarchy=reformat_section_hierarchy(block_output.section_hierarchy) 76 | ) 77 | 78 | def __call__(self, document: Document) -> JSONOutput: 79 | document_output = document.render() 80 | json_output = [] 81 | for page_output in document_output.children: 82 | json_output.append(self.extract_json(document, page_output)) 83 | return JSONOutput( 84 | children=json_output, 85 | metadata=self.generate_document_metadata(document, document_output) 86 | ) 87 | -------------------------------------------------------------------------------- /marker/schema/__init__.py: -------------------------------------------------------------------------------- 1 | from enum import auto, Enum 2 | 3 | 4 | class BlockTypes(str, Enum): 5 | Line = auto() 6 | Span = auto() 7 | Char = auto() 8 | FigureGroup = auto() 9 | TableGroup = auto() 10 | ListGroup = auto() 11 | PictureGroup = auto() 12 | Page = auto() 13 | Caption = auto() 14 | Code = auto() 15 | Figure = auto() 16 | Footnote = auto() 17 | Form = auto() 18 | Equation = auto() 19 | Handwriting = auto() 20 | TextInlineMath = auto() 21 | ListItem = auto() 22 | PageFooter = auto() 23 | PageHeader = auto() 24 | Picture = auto() 25 | SectionHeader = auto() 26 | Table = auto() 27 | Text = auto() 28 | TableOfContents = auto() 29 | Document = auto() 30 | ComplexRegion = auto() 31 | TableCell = auto() 32 | Reference = auto() 33 | 34 | def __str__(self): 35 | return self.name 36 | 
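
A quick illustration of the enum above (a standalone sketch, not a file from this repository): because __str__ is overridden, block types convert to their member names rather than their auto() values, which is what the renderers rely on when emitting attributes and metadata.

from marker.schema import BlockTypes

# __str__ returns the member name, not the auto() integer value
assert str(BlockTypes.Table) == "Table"
assert str(BlockTypes.SectionHeader) == "SectionHeader"

# Renderers compare block types against tuples like this one
image_types = (BlockTypes.Picture, BlockTypes.Figure)
print(BlockTypes.Picture in image_types)  # True
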
--------------------------------------------------------------------------------
/marker/schema/blocks/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | from marker.schema.blocks.base import Block, BlockId, BlockOutput
4 | from marker.schema.blocks.caption import Caption
5 | from marker.schema.blocks.code import Code
6 | from marker.schema.blocks.figure import Figure
7 | from marker.schema.blocks.footnote import Footnote
8 | from marker.schema.blocks.form import Form
9 | from marker.schema.blocks.equation import Equation
10 | from marker.schema.blocks.handwriting import Handwriting
11 | from marker.schema.blocks.inlinemath import InlineMath
12 | from marker.schema.blocks.listitem import ListItem
13 | from marker.schema.blocks.pagefooter import PageFooter
14 | from marker.schema.blocks.pageheader import PageHeader
15 | from marker.schema.blocks.picture import Picture
16 | from marker.schema.blocks.sectionheader import SectionHeader
17 | from marker.schema.blocks.table import Table
18 | from marker.schema.blocks.text import Text
19 | from marker.schema.blocks.toc import TableOfContents
20 | from marker.schema.blocks.complexregion import ComplexRegion
21 | from marker.schema.blocks.tablecell import TableCell
22 | from marker.schema.blocks.reference import Reference
23 | 
--------------------------------------------------------------------------------
/marker/schema/blocks/basetable.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | 
3 | from marker.schema import BlockTypes
4 | from marker.schema.blocks import Block, BlockOutput
5 | from marker.schema.blocks.tablecell import TableCell
6 | 
7 | 
8 | class BaseTable(Block):
9 |     block_type: BlockTypes | None = None
10 |     html: str | None = None
11 | 
12 |     @staticmethod
13 |     def format_cells(document, child_blocks, child_cells: List[TableCell] | None = None):
14 |         if child_cells is None:
15 |             child_cells: List[TableCell] = [document.get_block(c.id) for c in child_blocks if c.id.block_type == BlockTypes.TableCell]
16 | 
17 |         unique_rows = sorted(list(set([c.row_id for c in child_cells])))
18 |         html_repr = "<table><tbody>"
19 |         for row_id in unique_rows:
20 |             row_cells = sorted([c for c in child_cells if c.row_id == row_id], key=lambda x: x.col_id)
21 |             html_repr += "<tr>"
22 |             for cell in row_cells:
23 |                 html_repr += cell.assemble_html(document, child_blocks, None)
24 |             html_repr += "</tr>"
25 |         html_repr += "</tbody></table>"
26 |         return html_repr
27 | 
28 |     def assemble_html(self, document, child_blocks: List[BlockOutput], parent_structure=None):
29 |         # Filter out the table cells, so they don't render twice
30 |         child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
31 |         template = super().assemble_html(document, child_ref_blocks, parent_structure)
32 | 
33 |         child_block_types = set([c.id.block_type for c in child_blocks])
34 |         if self.html:
35 |             # LLM processor
36 |             return template + self.html
37 |         elif len(child_blocks) > 0 and BlockTypes.TableCell in child_block_types:
38 |             # Table processor
39 |             return template + self.format_cells(document, child_blocks)
40 |         else:
41 |             # Default text lines and spans
42 |             return f"<p>{template}</p>"
43 | 
--------------------------------------------------------------------------------
/marker/schema/blocks/caption.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 | 
4 | 
5 | class Caption(Block):
6 |     block_type: BlockTypes = BlockTypes.Caption
7 |     block_description: str = "A text caption that is directly above or below an image or table. Only used for text describing the image or table. "
8 |     replace_output_newlines: bool = True
9 |     html: str | None = None
10 | 
11 |     def assemble_html(self, document, child_blocks, parent_structure):
12 |         if self.html:
13 |             return super().handle_html_output(document, child_blocks, parent_structure)
14 | 
15 |         return super().assemble_html(document, child_blocks, parent_structure)
16 | 
17 | 
"
15 |                 f"{html.escape(code)}"
16 |                 f"
") 17 | -------------------------------------------------------------------------------- /marker/schema/blocks/complexregion.py: -------------------------------------------------------------------------------- 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class ComplexRegion(Block): 6 | block_type: BlockTypes = BlockTypes.ComplexRegion 7 | html: str | None = None 8 | block_description: str = "A complex region that can consist of multiple different types of blocks mixed with images. This block is chosen when it is difficult to categorize the region as a single block type." 9 | 10 | def assemble_html(self, document, child_blocks, parent_structure): 11 | if self.html: 12 | child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference] 13 | html = super().assemble_html(document, child_ref_blocks, parent_structure) 14 | return html + self.html 15 | else: 16 | template = super().assemble_html(document, child_blocks, parent_structure) 17 | return f"

{template}

" 18 | -------------------------------------------------------------------------------- /marker/schema/blocks/equation.py: -------------------------------------------------------------------------------- 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class Equation(Block): 6 | block_type: BlockTypes = BlockTypes.Equation 7 | html: str | None = None 8 | block_description: str = "A block math equation." 9 | 10 | def assemble_html(self, document, child_blocks, parent_structure=None): 11 | if self.html: 12 | child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference] 13 | html_out = super().assemble_html(document, child_ref_blocks, parent_structure) 14 | html_out += f"""

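
The html.escape call in Code.assemble_html above is what keeps raw source code from being interpreted as markup in the rendered output; a minimal standalone illustration (not repository code):

import html

snippet = "if a < b and b & mask:"
print(f"<pre>{html.escape(snippet)}</pre>")
# <pre>if a &lt; b and b &amp; mask:</pre>
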
--------------------------------------------------------------------------------
/marker/schema/blocks/equation.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 | 
4 | 
5 | class Equation(Block):
6 |     block_type: BlockTypes = BlockTypes.Equation
7 |     html: str | None = None
8 |     block_description: str = "A block math equation."
9 | 
10 |     def assemble_html(self, document, child_blocks, parent_structure=None):
11 |         if self.html:
12 |             child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
13 |             html_out = super().assemble_html(document, child_ref_blocks, parent_structure)
14 |             html_out += f"""<p block-type='{self.block_type}'>{self.html}</p>"""
15 |             return html_out
16 |         else:
17 |             template = super().assemble_html(document, child_blocks, parent_structure)
18 |             return f"<p block-type='{self.block_type}'>{template}</p>"
19 | 
--------------------------------------------------------------------------------
/marker/schema/blocks/figure.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 | 
4 | 
5 | class Figure(Block):
6 |     block_type: BlockTypes = BlockTypes.Figure
7 |     description: str | None = None
8 |     block_description: str = "A chart or other image that contains data."
9 | 
10 |     def assemble_html(self, document, child_blocks, parent_structure):
11 |         child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
12 |         html = super().assemble_html(document, child_ref_blocks, parent_structure)
13 |         if self.description:
14 |             html += f"<p>Image {self.id} description: {self.description}</p>"
15 |         return html
16 | 
--------------------------------------------------------------------------------
/marker/schema/blocks/footnote.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 | 
4 | 
5 | class Footnote(Block):
6 |     block_type: BlockTypes = BlockTypes.Footnote
7 |     block_description: str = "A footnote that explains a term or concept in the document."
8 |     replace_output_newlines: bool = True
9 |     html: str | None = None
10 | 
11 |     def assemble_html(self, document, child_blocks, parent_structure):
12 |         if self.html:
13 |             return super().handle_html_output(document, child_blocks, parent_structure)
14 | 
15 |         return super().assemble_html(document, child_blocks, parent_structure)
16 | 
--------------------------------------------------------------------------------
/marker/schema/blocks/form.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | 
3 | from marker.schema import BlockTypes
4 | from marker.schema.blocks.basetable import BaseTable
5 | 
6 | 
7 | class Form(BaseTable):
8 |     block_type: BlockTypes = BlockTypes.Form
9 |     block_description: str = "A form, such as a tax form, that contains fields and labels. It most likely doesn't have a table structure."
10 | 
--------------------------------------------------------------------------------
/marker/schema/blocks/handwriting.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 | 
4 | 
5 | class Handwriting(Block):
6 |     block_type: BlockTypes = BlockTypes.Handwriting
7 |     block_description: str = "A region that contains handwriting."
8 |     html: str | None = None
9 |     replace_output_newlines: bool = True
10 | 
11 |     def assemble_html(self, document, child_blocks, parent_structure):
12 |         if self.html:
13 |             return self.html
14 |         else:
15 |             return super().assemble_html(document, child_blocks, parent_structure)
16 | 
" * self.blockquote_level 30 | blockquote_suffix = "
" * self.blockquote_level 31 | return f"{blockquote_prefix}{template}

{blockquote_suffix}" 32 | else: 33 | return f"{template}

" 34 | -------------------------------------------------------------------------------- /marker/schema/blocks/listitem.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from marker.schema import BlockTypes 4 | from marker.schema.blocks import Block 5 | 6 | 7 | def replace_bullets(child_blocks): 8 | # Replace bullet characters with a - 9 | first_block = None 10 | while len(child_blocks) > 0: 11 | first_block = child_blocks[0] 12 | child_blocks = first_block.children 13 | 14 | if first_block is not None and first_block.id.block_type == BlockTypes.Line: 15 | bullet_pattern = r"(^|[\n ]|<[^>]*>)[•●○ഠ ം◦■▪▫–—-]( )" 16 | first_block.html = re.sub(bullet_pattern, r"\1\2", first_block.html) 17 | 18 | 19 | class ListItem(Block): 20 | block_type: BlockTypes = BlockTypes.ListItem 21 | list_indent_level: int = 0 22 | block_description: str = "A list item that is part of a list. This block is used to represent a single item in a list." 23 | html: str | None = None 24 | 25 | def assemble_html(self, document, child_blocks, parent_structure): 26 | template = super().assemble_html(document, child_blocks, parent_structure) 27 | template = template.replace("\n", " ") 28 | # Remove the first bullet character 29 | replace_bullets(child_blocks) 30 | 31 | if self.html: 32 | template = super().handle_html_output(document, child_blocks, parent_structure).strip() 33 | template = template.replace("
  • ", "").replace("
  • ", "") 34 | 35 | el_attr = f" block-type='{self.block_type}'" 36 | if self.list_indent_level: 37 | return f"
      {template}
    " 38 | return f"{template}" 39 | -------------------------------------------------------------------------------- /marker/schema/blocks/pagefooter.py: -------------------------------------------------------------------------------- 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class PageFooter(Block): 6 | block_type: str = BlockTypes.PageFooter 7 | block_description: str = "Text that appears at the bottom of a page, like a page number." 8 | replace_output_newlines: bool = True 9 | ignore_for_output: bool = True 10 | 11 | -------------------------------------------------------------------------------- /marker/schema/blocks/pageheader.py: -------------------------------------------------------------------------------- 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class PageHeader(Block): 6 | block_type: BlockTypes = BlockTypes.PageHeader 7 | block_description: str = "Text that appears at the top of a page, like a page title." 8 | replace_output_newlines: bool = True 9 | ignore_for_output: bool = True 10 | 11 | -------------------------------------------------------------------------------- /marker/schema/blocks/picture.py: -------------------------------------------------------------------------------- 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class Picture(Block): 6 | block_type: BlockTypes = BlockTypes.Picture 7 | description: str | None = None 8 | block_description: str = "An image block that represents a picture." 9 | 10 | def assemble_html(self, document, child_blocks, parent_structure): 11 | child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference] 12 | html = super().assemble_html(document, child_ref_blocks, parent_structure) 13 | 14 | if self.description: 15 | return html + f"

--------------------------------------------------------------------------------
/marker/schema/blocks/picture.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 | 
4 | 
5 | class Picture(Block):
6 |     block_type: BlockTypes = BlockTypes.Picture
7 |     description: str | None = None
8 |     block_description: str = "An image block that represents a picture."
9 | 
10 |     def assemble_html(self, document, child_blocks, parent_structure):
11 |         child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
12 |         html = super().assemble_html(document, child_ref_blocks, parent_structure)
13 | 
14 |         if self.description:
15 |             return html + f"<p>Image {self.id} description: {self.description}</p>"
16 |         return html
17 | 
--------------------------------------------------------------------------------
/marker/schema/blocks/reference.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks import Block
3 | 
4 | 
5 | class Reference(Block):
6 |     block_type: BlockTypes = BlockTypes.Reference
7 |     ref: str
8 |     block_description: str = "A reference to this block from another block."
9 | 
10 |     def assemble_html(self, document, child_blocks, parent_structure=None):
11 |         template = super().assemble_html(document, child_blocks, parent_structure)
12 |         return f"<span id='{self.ref}'>{template}</span>"
13 | 
--------------------------------------------------------------------------------
/marker/schema/blocks/sectionheader.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | 
3 | from marker.schema import BlockTypes
4 | from marker.schema.blocks import Block
5 | 
6 | 
7 | class SectionHeader(Block):
8 |     block_type: BlockTypes = BlockTypes.SectionHeader
9 |     heading_level: Optional[int] = None
10 |     block_description: str = "The header of a section of text or other blocks."
11 |     html: str | None = None
12 | 
13 |     def assemble_html(self, document, child_blocks, parent_structure):
14 |         if self.ignore_for_output:
15 |             return ""
16 | 
17 |         if self.html:
18 |             return super().handle_html_output(document, child_blocks, parent_structure)
19 | 
20 |         template = super().assemble_html(document, child_blocks, parent_structure)
21 |         template = template.replace("\n", " ")
22 |         tag = f"h{self.heading_level}" if self.heading_level else "h2"
23 |         return f"<{tag}>{template}</{tag}>"
24 | 
--------------------------------------------------------------------------------
/marker/schema/blocks/table.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.blocks.basetable import BaseTable
3 | 
4 | 
5 | class Table(BaseTable):
6 |     block_type: BlockTypes = BlockTypes.Table
7 |     block_description: str = "A table of data, like a results table. It will be in a tabular format."
8 | 
    ".join(self.text_lines) 31 | return f"{tag}>{text}" 32 | -------------------------------------------------------------------------------- /marker/schema/blocks/text.py: -------------------------------------------------------------------------------- 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class Text(Block): 6 | block_type: BlockTypes = BlockTypes.Text 7 | has_continuation: bool = False 8 | blockquote: bool = False 9 | blockquote_level: int = 0 10 | html: str | None = None 11 | block_description: str = "A paragraph or line of text." 12 | 13 | def assemble_html(self, document, child_blocks, parent_structure): 14 | if self.ignore_for_output: 15 | return "" 16 | 17 | # This happens when we used an llm processor 18 | if self.html: 19 | return super().handle_html_output(document, child_blocks, parent_structure) 20 | 21 | template = super().assemble_html(document, child_blocks, parent_structure) 22 | template = template.replace("\n", " ") 23 | 24 | el_attr = f" block-type='{self.block_type}'" 25 | if self.has_continuation: 26 | el_attr += " class='has-continuation'" 27 | 28 | if self.blockquote: 29 | blockquote_prefix = "
    " * self.blockquote_level 30 | blockquote_suffix = "
    " * self.blockquote_level 31 | return f"{blockquote_prefix}{template}

    {blockquote_suffix}" 32 | else: 33 | return f"{template}

    " 34 | -------------------------------------------------------------------------------- /marker/schema/blocks/toc.py: -------------------------------------------------------------------------------- 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks.basetable import BaseTable 3 | 4 | 5 | class TableOfContents(BaseTable): 6 | block_type: str = BlockTypes.TableOfContents 7 | block_description: str = "A table of contents." 8 | -------------------------------------------------------------------------------- /marker/schema/document.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import List, Sequence 4 | 5 | from pydantic import BaseModel 6 | 7 | from marker.schema import BlockTypes 8 | from marker.schema.blocks import Block, BlockId, BlockOutput 9 | from marker.schema.groups.page import PageGroup 10 | 11 | 12 | class DocumentOutput(BaseModel): 13 | children: List[BlockOutput] 14 | html: str 15 | block_type: BlockTypes = BlockTypes.Document 16 | 17 | 18 | class TocItem(BaseModel): 19 | title: str 20 | heading_level: int 21 | page_id: int 22 | polygon: List[List[float]] 23 | 24 | 25 | class Document(BaseModel): 26 | filepath: str 27 | pages: List[PageGroup] 28 | block_type: BlockTypes = BlockTypes.Document 29 | table_of_contents: List[TocItem] | None = None 30 | debug_data_path: str | None = None # Path that debug data was saved to 31 | 32 | def get_block(self, block_id: BlockId): 33 | page = self.get_page(block_id.page_id) 34 | block = page.get_block(block_id) 35 | if block: 36 | return block 37 | return None 38 | 39 | def get_page(self, page_id): 40 | for page in self.pages: 41 | if page.page_id == page_id: 42 | return page 43 | return None 44 | 45 | def get_next_block(self, block: Block, ignored_block_types: List[BlockTypes] = None): 46 | if ignored_block_types is None: 47 | ignored_block_types = [] 48 | next_block = None 49 | 50 | # Try to find the next block in the current page 51 | page = self.get_page(block.page_id) 52 | next_block = page.get_next_block(block, ignored_block_types) 53 | if next_block: 54 | return next_block 55 | 56 | # If no block found, search subsequent pages 57 | for page in self.pages[self.pages.index(page) + 1:]: 58 | next_block = page.get_next_block(None, ignored_block_types) 59 | if next_block: 60 | return next_block 61 | return None 62 | 63 | def get_next_page(self, page: PageGroup): 64 | page_idx = self.pages.index(page) 65 | if page_idx + 1 < len(self.pages): 66 | return self.pages[page_idx + 1] 67 | return None 68 | 69 | def get_prev_block(self, block: Block): 70 | page = self.get_page(block.page_id) 71 | prev_block = page.get_prev_block(block) 72 | if prev_block: 73 | return prev_block 74 | prev_page = self.get_prev_page(page) 75 | if not prev_page: 76 | return None 77 | return prev_page.get_block(prev_page.structure[-1]) 78 | 79 | def get_prev_page(self, page: PageGroup): 80 | page_idx = self.pages.index(page) 81 | if page_idx > 0: 82 | return self.pages[page_idx - 1] 83 | return None 84 | 85 | def assemble_html(self, child_blocks: List[Block]): 86 | template = "" 87 | for c in child_blocks: 88 | template += f"" 89 | return template 90 | 91 | def render(self): 92 | child_content = [] 93 | section_hierarchy = None 94 | for page in self.pages: 95 | rendered = page.render(self, None, section_hierarchy) 96 | section_hierarchy = rendered.section_hierarchy.copy() 97 | child_content.append(rendered) 98 | 99 | return DocumentOutput( 100 | 
--------------------------------------------------------------------------------
/marker/schema/document.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | from typing import List, Sequence
4 | 
5 | from pydantic import BaseModel
6 | 
7 | from marker.schema import BlockTypes
8 | from marker.schema.blocks import Block, BlockId, BlockOutput
9 | from marker.schema.groups.page import PageGroup
10 | 
11 | 
12 | class DocumentOutput(BaseModel):
13 |     children: List[BlockOutput]
14 |     html: str
15 |     block_type: BlockTypes = BlockTypes.Document
16 | 
17 | 
18 | class TocItem(BaseModel):
19 |     title: str
20 |     heading_level: int
21 |     page_id: int
22 |     polygon: List[List[float]]
23 | 
24 | 
25 | class Document(BaseModel):
26 |     filepath: str
27 |     pages: List[PageGroup]
28 |     block_type: BlockTypes = BlockTypes.Document
29 |     table_of_contents: List[TocItem] | None = None
30 |     debug_data_path: str | None = None  # Path that debug data was saved to
31 | 
32 |     def get_block(self, block_id: BlockId):
33 |         page = self.get_page(block_id.page_id)
34 |         block = page.get_block(block_id)
35 |         if block:
36 |             return block
37 |         return None
38 | 
39 |     def get_page(self, page_id):
40 |         for page in self.pages:
41 |             if page.page_id == page_id:
42 |                 return page
43 |         return None
44 | 
45 |     def get_next_block(self, block: Block, ignored_block_types: List[BlockTypes] = None):
46 |         if ignored_block_types is None:
47 |             ignored_block_types = []
48 |         next_block = None
49 | 
50 |         # Try to find the next block in the current page
51 |         page = self.get_page(block.page_id)
52 |         next_block = page.get_next_block(block, ignored_block_types)
53 |         if next_block:
54 |             return next_block
55 | 
56 |         # If no block found, search subsequent pages
57 |         for page in self.pages[self.pages.index(page) + 1:]:
58 |             next_block = page.get_next_block(None, ignored_block_types)
59 |             if next_block:
60 |                 return next_block
61 |         return None
62 | 
63 |     def get_next_page(self, page: PageGroup):
64 |         page_idx = self.pages.index(page)
65 |         if page_idx + 1 < len(self.pages):
66 |             return self.pages[page_idx + 1]
67 |         return None
68 | 
69 |     def get_prev_block(self, block: Block):
70 |         page = self.get_page(block.page_id)
71 |         prev_block = page.get_prev_block(block)
72 |         if prev_block:
73 |             return prev_block
74 |         prev_page = self.get_prev_page(page)
75 |         if not prev_page:
76 |             return None
77 |         return prev_page.get_block(prev_page.structure[-1])
78 | 
79 |     def get_prev_page(self, page: PageGroup):
80 |         page_idx = self.pages.index(page)
81 |         if page_idx > 0:
82 |             return self.pages[page_idx - 1]
83 |         return None
84 | 
85 |     def assemble_html(self, child_blocks: List[Block]):
86 |         template = ""
87 |         for c in child_blocks:
88 |             template += f"<content-ref src='{c.id}'></content-ref>"
89 |         return template
90 | 
91 |     def render(self):
92 |         child_content = []
93 |         section_hierarchy = None
94 |         for page in self.pages:
95 |             rendered = page.render(self, None, section_hierarchy)
96 |             section_hierarchy = rendered.section_hierarchy.copy()
97 |             child_content.append(rendered)
98 | 
99 |         return DocumentOutput(
100 |             children=child_content,
101 |             html=self.assemble_html(child_content)
102 |         )
103 | 
104 |     def contained_blocks(self, block_types: Sequence[BlockTypes] = None) -> List[Block]:
105 |         blocks = []
106 |         for page in self.pages:
107 |             blocks += page.contained_blocks(self, block_types)
108 |         return blocks
109 | 
--------------------------------------------------------------------------------
/marker/schema/groups/__init__.py:
--------------------------------------------------------------------------------
1 | from marker.schema.blocks.base import Block
2 | from marker.schema.groups.figure import FigureGroup
3 | from marker.schema.groups.table import TableGroup
4 | from marker.schema.groups.list import ListGroup
5 | from marker.schema.groups.picture import PictureGroup
6 | from marker.schema.groups.page import PageGroup
7 | 
--------------------------------------------------------------------------------
/marker/schema/groups/base.py:
--------------------------------------------------------------------------------
1 | from marker.schema.blocks import Block
2 | 
3 | 
4 | class Group(Block):
5 |     pass
--------------------------------------------------------------------------------
/marker/schema/groups/figure.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.groups.base import Group
3 | 
4 | 
5 | class FigureGroup(Group):
6 |     block_type: BlockTypes = BlockTypes.FigureGroup
7 |     block_description: str = "A group that contains a figure and associated captions."
8 | 
--------------------------------------------------------------------------------
/marker/schema/groups/list.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.groups.base import Group
3 | 
4 | 
5 | class ListGroup(Group):
6 |     block_type: BlockTypes = BlockTypes.ListGroup
7 |     has_continuation: bool = False
8 |     block_description: str = "A group of list items that should be rendered together."
9 | 
10 |     def assemble_html(self, document, child_blocks, parent_structure):
11 |         template = super().assemble_html(document, child_blocks, parent_structure)
12 | 
13 |         el_attr = f" block-type='{self.block_type}'"
14 |         if self.has_continuation:
15 |             el_attr += " class='has-continuation'"
16 |         return f"<ul{el_attr}>{template}</ul>"
17 | 
--------------------------------------------------------------------------------
/marker/schema/groups/picture.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.groups.base import Group
3 | 
4 | 
5 | class PictureGroup(Group):
6 |     block_type: BlockTypes = BlockTypes.PictureGroup
7 |     block_description: str = "A picture along with associated captions."
8 | 
--------------------------------------------------------------------------------
/marker/schema/groups/table.py:
--------------------------------------------------------------------------------
1 | from marker.schema import BlockTypes
2 | from marker.schema.groups.base import Group
3 | 
4 | 
5 | class TableGroup(Group):
6 |     block_type: BlockTypes = BlockTypes.TableGroup
7 |     block_description: str = "A table along with associated captions."
8 | 
--------------------------------------------------------------------------------
/marker/schema/registry.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Type
2 | from importlib import import_module
3 | 
4 | from marker.schema import BlockTypes
5 | from marker.schema.blocks import (
6 |     Block,
7 |     Caption,
8 |     Code,
9 |     Equation,
10 |     Figure,
11 |     Footnote,
12 |     Form,
13 |     Handwriting,
14 |     InlineMath,
15 |     ListItem,
16 |     PageFooter,
17 |     PageHeader,
18 |     Picture,
19 |     SectionHeader,
20 |     Table,
21 |     TableOfContents,
22 |     Text,
23 |     ComplexRegion,
24 |     TableCell,
25 |     Reference,
26 | )
27 | from marker.schema.document import Document
28 | from marker.schema.groups import (
29 |     FigureGroup,
30 |     ListGroup,
31 |     PageGroup,
32 |     PictureGroup,
33 |     TableGroup,
34 | )
35 | from marker.schema.text import Line, Span
36 | from marker.schema.text.char import Char
37 | 
38 | BLOCK_REGISTRY: Dict[BlockTypes, str] = {}
39 | 
40 | 
41 | def register_block_class(block_type: BlockTypes, block_cls: Type[Block]):
42 |     BLOCK_REGISTRY[block_type] = f"{block_cls.__module__}.{block_cls.__name__}"
43 | 
44 | 
45 | def get_block_class(block_type: BlockTypes) -> Type[Block]:
46 |     class_path = BLOCK_REGISTRY[block_type]
47 |     module_name, class_name = class_path.rsplit(".", 1)
48 |     module = import_module(module_name)
49 |     return getattr(module, class_name)
50 | 
51 | 
52 | register_block_class(BlockTypes.Line, Line)
53 | register_block_class(BlockTypes.Span, Span)
54 | register_block_class(BlockTypes.Char, Char)
55 | register_block_class(BlockTypes.FigureGroup, FigureGroup)
56 | register_block_class(BlockTypes.TableGroup, TableGroup)
57 | register_block_class(BlockTypes.ListGroup, ListGroup)
58 | register_block_class(BlockTypes.PictureGroup, PictureGroup)
59 | register_block_class(BlockTypes.Page, PageGroup)
60 | register_block_class(BlockTypes.Caption, Caption)
61 | register_block_class(BlockTypes.Code, Code)
62 | register_block_class(BlockTypes.Figure, Figure)
63 | register_block_class(BlockTypes.Footnote, Footnote)
64 | register_block_class(BlockTypes.Form, Form)
65 | register_block_class(BlockTypes.Equation, Equation)
66 | register_block_class(BlockTypes.Handwriting, Handwriting)
67 | register_block_class(BlockTypes.TextInlineMath, InlineMath)
68 | register_block_class(BlockTypes.ListItem, ListItem)
69 | register_block_class(BlockTypes.PageFooter, PageFooter)
70 | register_block_class(BlockTypes.PageHeader, PageHeader)
71 | register_block_class(BlockTypes.Picture, Picture)
72 | register_block_class(BlockTypes.SectionHeader, SectionHeader)
73 | 
register_block_class(BlockTypes.Table, Table) 74 | register_block_class(BlockTypes.Text, Text) 75 | register_block_class(BlockTypes.TableOfContents, TableOfContents) 76 | register_block_class(BlockTypes.ComplexRegion, ComplexRegion) 77 | register_block_class(BlockTypes.TableCell, TableCell) 78 | register_block_class(BlockTypes.Reference, Reference) 79 | register_block_class(BlockTypes.Document, Document) 80 | 81 | assert len(BLOCK_REGISTRY) == len(BlockTypes) 82 | assert all( 83 | [ 84 | get_block_class(k).model_fields["block_type"].default == k 85 | for k, _ in BLOCK_REGISTRY.items() 86 | ] 87 | ) 88 | -------------------------------------------------------------------------------- /marker/schema/text/__init__.py: -------------------------------------------------------------------------------- 1 | from marker.schema.text.line import Line 2 | from marker.schema.text.span import Span 3 | -------------------------------------------------------------------------------- /marker/schema/text/char.py: -------------------------------------------------------------------------------- 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class Char(Block): 6 | block_type: BlockTypes = BlockTypes.Char 7 | block_description: str = "A single character inside a span." 8 | 9 | text: str 10 | idx: int 11 | -------------------------------------------------------------------------------- /marker/schema/text/span.py: -------------------------------------------------------------------------------- 1 | import html 2 | import re 3 | from typing import List, Literal, Optional 4 | 5 | from marker.schema import BlockTypes 6 | from marker.schema.blocks import Block 7 | 8 | 9 | def cleanup_text(full_text): 10 | full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text) 11 | full_text = full_text.replace('\xa0', ' ') # Replace non-breaking spaces 12 | return full_text 13 | 14 | 15 | class Span(Block): 16 | block_type: BlockTypes = BlockTypes.Span 17 | block_description: str = "A span of text inside a line." 
18 | 
19 |     text: str
20 |     font: str
21 |     font_weight: float
22 |     font_size: float
23 |     minimum_position: int
24 |     maximum_position: int
25 |     formats: List[Literal['plain', 'math', 'chemical', 'bold', 'italic', 'highlight', 'subscript', 'superscript', 'small', 'code', 'underline']]
26 |     has_superscript: bool = False
27 |     has_subscript: bool = False
28 |     url: Optional[str] = None
29 |     html: Optional[str] = None
30 | 
31 |     @property
32 |     def bold(self):
33 |         return 'bold' in self.formats
34 | 
35 |     @property
36 |     def italic(self):
37 |         return 'italic' in self.formats
38 | 
39 |     @property
40 |     def math(self):
41 |         return 'math' in self.formats
42 | 
43 |     @property
44 |     def highlight(self):
45 |         return 'highlight' in self.formats
46 | 
47 |     @property
48 |     def superscript(self):
49 |         return 'superscript' in self.formats
50 | 
51 |     @property
52 |     def subscript(self):
53 |         return 'subscript' in self.formats
54 | 
55 |     @property
56 |     def small(self):
57 |         return 'small' in self.formats
58 | 
59 |     @property
60 |     def code(self):
61 |         return 'code' in self.formats
62 | 
63 |     @property
64 |     def underline(self):
65 |         return 'underline' in self.formats
66 | 
67 |     def assemble_html(self, document, child_blocks, parent_structure):
68 |         if self.ignore_for_output:
69 |             return ""
70 | 
71 |         if self.html:
72 |             return self.html
73 | 
74 |         text = self.text
75 | 
76 |         # Remove trailing newlines
77 |         replaced_newline = False
78 |         while len(text) > 0 and text[-1] in ["\n", "\r"]:
79 |             text = text[:-1]
80 |             replaced_newline = True
81 | 
82 |         # Remove leading newlines
83 |         while len(text) > 0 and text[0] in ["\n", "\r"]:
84 |             text = text[1:]
85 | 
86 |         if replaced_newline and not text.endswith('-'):
87 |             text += " "
88 | 
89 |         text = text.replace("-\n", "")  # Remove hyphenated line breaks from the middle of the span
90 |         text = html.escape(text)
91 |         text = cleanup_text(text)
92 | 
93 |         if self.has_superscript:
94 |             text = re.sub(r"^([0-9\W]+)(.*)", r"<sup>\1</sup>\2", text)
95 | 
96 |             # Handle full block superscript
97 |             if "<sup>" not in text:
98 |                 text = f"<sup>{text}</sup>"
99 | 
100 |         if self.url:
101 |             text = f"<a href='{self.url}'>{text}</a>"
102 | 
103 |         # TODO Support multiple formats
104 |         if self.italic:
105 |             text = f"<i>{text}</i>"
106 |         elif self.bold:
107 |             text = f"<b>{text}</b>"
108 |         elif self.math:
109 |             text = f"<math display='inline'>{text}</math>"
110 |         elif self.highlight:
111 |             text = f"<mark>{text}</mark>"
112 |         elif self.subscript:
113 |             text = f"<sub>{text}</sub>"
114 |         elif self.superscript:
115 |             text = f"<sup>{text}</sup>"
116 |         elif self.underline:
117 |             text = f"<u>{text}</u>"
118 |         elif self.small:
119 |             text = f"<small>{text}</small>"
120 |         elif self.code:
121 |             text = f"<code>{text}</code>"
122 | 
123 |         return text
124 | 
--------------------------------------------------------------------------------
/marker/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VikParuchuri/marker/86af672aec27f4f8c5a20abbfdc4fbcf0b630108/marker/scripts/__init__.py
--------------------------------------------------------------------------------
/marker/scripts/chunk_convert.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import subprocess
4 | import pkg_resources
5 | 
6 | 
7 | def chunk_convert_cli():
8 |     parser = argparse.ArgumentParser(description="Convert a folder of PDFs to a folder of markdown files in chunks.")
9 |     parser.add_argument("in_folder", help="Input folder with pdfs.")
10 |     parser.add_argument("out_folder", help="Output folder")
11 |     args = parser.parse_args()
12 | 
13 |     cur_dir =
os.path.dirname(os.path.abspath(__file__)) 14 | script_path = os.path.join(cur_dir, "chunk_convert.sh") 15 | 16 | # Construct the command 17 | cmd = f"{script_path} {args.in_folder} {args.out_folder}" 18 | 19 | # Execute the shell script 20 | subprocess.run(cmd, shell=True, check=True) -------------------------------------------------------------------------------- /marker/scripts/chunk_convert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap 'pkill -P $$' SIGINT 4 | 5 | # Check if NUM_DEVICES is set 6 | if [[ -z "$NUM_DEVICES" ]]; then 7 | echo "Please set the NUM_DEVICES environment variable." 8 | exit 1 9 | fi 10 | 11 | if [[ -z "$NUM_WORKERS" ]]; then 12 | echo "Please set the NUM_WORKERS environment variable." 13 | exit 1 14 | fi 15 | 16 | # Get input folder and output folder from args 17 | if [[ -z "$1" ]]; then 18 | echo "Please provide an input folder." 19 | exit 1 20 | fi 21 | 22 | if [[ -z "$2" ]]; then 23 | echo "Please provide an output folder." 24 | exit 1 25 | fi 26 | 27 | INPUT_FOLDER=$1 28 | OUTPUT_FOLDER=$2 29 | 30 | # Ensure output folder exists 31 | mkdir -p "$OUTPUT_FOLDER" 32 | 33 | # Loop from 0 to NUM_DEVICES and run the marker command in parallel 34 | for (( i=0; i<$NUM_DEVICES; i++ )); do 35 | DEVICE_NUM=$i 36 | export DEVICE_NUM 37 | export NUM_DEVICES 38 | export NUM_WORKERS 39 | echo "Running marker on GPU $DEVICE_NUM" 40 | cmd="CUDA_VISIBLE_DEVICES=$DEVICE_NUM marker $INPUT_FOLDER --output_dir $OUTPUT_FOLDER --num_chunks $NUM_DEVICES --chunk_idx $DEVICE_NUM --workers $NUM_WORKERS" 41 | eval $cmd & 42 | 43 | sleep 5 44 | done 45 | 46 | # Wait for all background processes to finish 47 | wait -------------------------------------------------------------------------------- /marker/scripts/convert_single.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["GRPC_VERBOSITY"] = "ERROR" 4 | os.environ["GLOG_minloglevel"] = "2" 5 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = ( 6 | "1" # Transformers uses .isin for a simple op, which is not supported on MPS 7 | ) 8 | 9 | import time 10 | import click 11 | 12 | from marker.config.parser import ConfigParser 13 | from marker.config.printer import CustomClickPrinter 14 | from marker.logger import configure_logging, get_logger 15 | from marker.models import create_model_dict 16 | from marker.output import save_output 17 | 18 | configure_logging() 19 | logger = get_logger() 20 | 21 | 22 | @click.command(cls=CustomClickPrinter, help="Convert a single PDF to markdown.") 23 | @click.argument("fpath", type=str) 24 | @ConfigParser.common_options 25 | def convert_single_cli(fpath: str, **kwargs): 26 | models = create_model_dict() 27 | start = time.time() 28 | config_parser = ConfigParser(kwargs) 29 | 30 | converter_cls = config_parser.get_converter_cls() 31 | converter = converter_cls( 32 | config=config_parser.generate_config_dict(), 33 | artifact_dict=models, 34 | processor_list=config_parser.get_processors(), 35 | renderer=config_parser.get_renderer(), 36 | llm_service=config_parser.get_llm_service(), 37 | ) 38 | rendered = converter(fpath) 39 | out_folder = config_parser.get_output_folder(fpath) 40 | save_output(rendered, out_folder, config_parser.get_base_filename(fpath)) 41 | 42 | logger.info(f"Saved markdown to {out_folder}") 43 | logger.info(f"Total time: {time.time() - start}") 44 | -------------------------------------------------------------------------------- /marker/scripts/file_to_s3.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import shutil 3 | import datetime 4 | from pathlib import Path 5 | import boto3 6 | 7 | from huggingface_hub import snapshot_download 8 | 9 | import click 10 | 11 | S3_API_URL = "https://1afbe4656a6b40d982ab5e730a39f6b9.r2.cloudflarestorage.com" 12 | 13 | @click.command(help="Uploads files to an S3 bucket") 14 | @click.argument("filepath", type=str) 15 | @click.argument("s3_path", type=str) 16 | @click.option("--bucket_name", type=str, default="datalab") 17 | @click.option("--access_key_id", type=str, default="") 18 | @click.option("--access_key_secret", type=str, default="") 19 | def main(filepath: str, s3_path: str, bucket_name: str, access_key_id: str, access_key_secret: str): 20 | filepath = Path(filepath) 21 | # Upload the files to S3 22 | s3_client = boto3.client( 23 | 's3', 24 | endpoint_url=S3_API_URL, 25 | aws_access_key_id=access_key_id, 26 | aws_secret_access_key=access_key_secret, 27 | region_name="enam" 28 | ) 29 | 30 | s3_key = f"{s3_path}/{filepath.name}" 31 | 32 | try: 33 | s3_client.upload_file( 34 | str(filepath), 35 | bucket_name, 36 | s3_key 37 | ) 38 | except Exception as e: 39 | print(f"Error uploading {filepath}: {str(e)}") 40 | 41 | print(f"Uploaded files to {s3_path}") 42 | 43 | if __name__ == "__main__": 44 | main() 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /marker/scripts/run_streamlit_app.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import os 3 | import sys 4 | 5 | 6 | def streamlit_app_cli(app_name: str = "streamlit_app.py"): 7 | argv = sys.argv[1:] 8 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 9 | app_path = os.path.join(cur_dir, app_name) 10 | cmd = [ 11 | "streamlit", 12 | "run", 13 | app_path, 14 | "--server.fileWatcherType", 15 | "none", 16 | "--server.headless", 17 | "true", 18 | ] 19 | if argv: 20 | cmd += ["--"] + argv 21 | subprocess.run(cmd, env={**os.environ, "IN_STREAMLIT": "true"}) 22 | 23 | 24 | def extraction_app_cli(): 25 | streamlit_app_cli("extraction_app.py") 26 | -------------------------------------------------------------------------------- /marker/services/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List, Annotated 2 | 3 | import PIL 4 | from pydantic import BaseModel 5 | 6 | from marker.schema.blocks import Block 7 | from marker.util import assign_config, verify_config_keys 8 | 9 | 10 | class BaseService: 11 | timeout: Annotated[int, "The timeout to use for the service."] = 30 12 | max_retries: Annotated[ 13 | int, "The maximum number of retries to use for the service." 14 | ] = 2 15 | retry_wait_time: Annotated[int, "The wait time between retries."] = 3 16 | 17 | def __init__(self, config: Optional[BaseModel | dict] = None): 18 | assign_config(self, config) 19 | 20 | # Ensure we have all necessary fields filled out (API keys, etc.) 
21 | verify_config_keys(self) 22 | 23 | def __call__( 24 | self, 25 | prompt: str, 26 | image: PIL.Image.Image | List[PIL.Image.Image], 27 | block: Block, 28 | response_schema: type[BaseModel], 29 | max_retries: int | None = None, 30 | timeout: int | None = None, 31 | ): 32 | raise NotImplementedError 33 | -------------------------------------------------------------------------------- /marker/services/gemini.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from io import BytesIO 4 | from typing import List, Annotated 5 | 6 | import PIL 7 | from google import genai 8 | from google.genai import types 9 | from google.genai.errors import APIError 10 | from marker.logger import get_logger 11 | from pydantic import BaseModel 12 | 13 | from marker.schema.blocks import Block 14 | from marker.services import BaseService 15 | 16 | logger = get_logger() 17 | 18 | 19 | class BaseGeminiService(BaseService): 20 | gemini_model_name: Annotated[ 21 | str, "The name of the Google model to use for the service." 22 | ] = "gemini-2.0-flash" 23 | 24 | def img_to_bytes(self, img: PIL.Image.Image): 25 | image_bytes = BytesIO() 26 | img.save(image_bytes, format="WEBP") 27 | return image_bytes.getvalue() 28 | 29 | def get_google_client(self, timeout: int): 30 | raise NotImplementedError 31 | 32 | def __call__( 33 | self, 34 | prompt: str, 35 | image: PIL.Image.Image | List[PIL.Image.Image], 36 | block: Block, 37 | response_schema: type[BaseModel], 38 | max_retries: int | None = None, 39 | timeout: int | None = None, 40 | ): 41 | if max_retries is None: 42 | max_retries = self.max_retries 43 | 44 | if timeout is None: 45 | timeout = self.timeout 46 | 47 | if not isinstance(image, list): 48 | image = [image] 49 | 50 | client = self.get_google_client(timeout=timeout) 51 | image_parts = [ 52 | types.Part.from_bytes(data=self.img_to_bytes(img), mime_type="image/webp") 53 | for img in image 54 | ] 55 | 56 | tries = 0 57 | while tries < max_retries: 58 | try: 59 | responses = client.models.generate_content( 60 | model=self.gemini_model_name, 61 | contents=image_parts 62 | + [ 63 | prompt 64 | ], # According to gemini docs, it performs better if the image is the first element 65 | config={ 66 | "temperature": 0, 67 | "response_schema": response_schema, 68 | "response_mime_type": "application/json", 69 | }, 70 | ) 71 | output = responses.candidates[0].content.parts[0].text 72 | total_tokens = responses.usage_metadata.total_token_count 73 | block.update_metadata(llm_tokens_used=total_tokens, llm_request_count=1) 74 | return json.loads(output) 75 | except APIError as e: 76 | if e.code in [429, 443, 503]: 77 | # Rate limit exceeded 78 | tries += 1 79 | wait_time = tries * self.retry_wait_time 80 | logger.warning( 81 | f"APIError: {e}. Retrying in {wait_time} seconds... 
(Attempt {tries}/{max_retries})" 82 | ) 83 | time.sleep(wait_time) 84 | else: 85 | logger.error(f"APIError: {e}") 86 | break 87 | except Exception as e: 88 | logger.error(f"Exception: {e}") 89 | break 90 | 91 | return {} 92 | 93 | 94 | class GoogleGeminiService(BaseGeminiService): 95 | gemini_api_key: Annotated[str, "The Google API key to use for the service."] = None 96 | 97 | def get_google_client(self, timeout: int): 98 | return genai.Client( 99 | api_key=self.gemini_api_key, 100 | http_options={"timeout": timeout * 1000}, # Convert to milliseconds 101 | ) 102 | -------------------------------------------------------------------------------- /marker/services/ollama.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | from io import BytesIO 4 | from typing import Annotated, List 5 | 6 | import PIL 7 | import requests 8 | from marker.logger import get_logger 9 | from pydantic import BaseModel 10 | 11 | from marker.schema.blocks import Block 12 | from marker.services import BaseService 13 | 14 | logger = get_logger() 15 | 16 | 17 | class OllamaService(BaseService): 18 | ollama_base_url: Annotated[ 19 | str, "The base url to use for ollama. No trailing slash." 20 | ] = "http://localhost:11434" 21 | ollama_model: Annotated[str, "The model name to use for ollama."] = ( 22 | "llama3.2-vision" 23 | ) 24 | 25 | def image_to_base64(self, image: PIL.Image.Image): 26 | image_bytes = BytesIO() 27 | image.save(image_bytes, format="PNG") 28 | return base64.b64encode(image_bytes.getvalue()).decode("utf-8") 29 | 30 | def __call__( 31 | self, 32 | prompt: str, 33 | image: PIL.Image.Image | List[PIL.Image.Image], 34 | block: Block, 35 | response_schema: type[BaseModel], 36 | max_retries: int | None = None, 37 | timeout: int | None = None, 38 | ): 39 | url = f"{self.ollama_base_url}/api/generate" 40 | headers = {"Content-Type": "application/json"} 41 | 42 | schema = response_schema.model_json_schema() 43 | format_schema = { 44 | "type": "object", 45 | "properties": schema["properties"], 46 | "required": schema["required"], 47 | } 48 | 49 | if not isinstance(image, list): 50 | image = [image] 51 | 52 | image_bytes = [self.image_to_base64(img) for img in image] 53 | 54 | payload = { 55 | "model": self.ollama_model, 56 | "prompt": prompt, 57 | "stream": False, 58 | "format": format_schema, 59 | "images": image_bytes, 60 | } 61 | 62 | try: 63 | response = requests.post(url, json=payload, headers=headers) 64 | response.raise_for_status() 65 | response_data = response.json() 66 | 67 | total_tokens = ( 68 | response_data["prompt_eval_count"] + response_data["eval_count"] 69 | ) 70 | block.update_metadata(llm_request_count=1, llm_tokens_used=total_tokens) 71 | 72 | data = response_data["response"] 73 | return json.loads(data) 74 | except Exception as e: 75 | logger.warning(f"Ollama inference failed: {e}") 76 | 77 | return {} 78 | -------------------------------------------------------------------------------- /marker/services/vertex.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated 2 | 3 | from google import genai 4 | 5 | from marker.services.gemini import BaseGeminiService 6 | 7 | class GoogleVertexService(BaseGeminiService): 8 | vertex_project_id: Annotated[ 9 | str, 10 | "Google Cloud Project ID for Vertex AI.", 11 | ] = None 12 | vertex_location: Annotated[ 13 | str, 14 | "Google Cloud Location for Vertex AI.", 15 | ] = "us-central1" 16 | gemini_model_name: Annotated[ 17 | 
str, 18 | "The name of the Google model to use for the service." 19 | ] = "gemini-2.0-flash-001" 20 | vertex_dedicated: Annotated[ 21 | bool, 22 | "Whether to use a dedicated Vertex AI instance." 23 | ] = False 24 | 25 | def get_google_client(self, timeout: int): 26 | http_options = {"timeout": timeout * 1000} # Convert to milliseconds 27 | if self.vertex_dedicated: 28 | http_options["headers"] = {"x-vertex-ai-llm-request-type": "dedicated"} 29 | return genai.Client( 30 | vertexai=True, 31 | project=self.vertex_project_id, 32 | location=self.vertex_location, 33 | http_options=http_options, 34 | ) -------------------------------------------------------------------------------- /marker/settings.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from dotenv import find_dotenv 4 | from pydantic import computed_field 5 | from pydantic_settings import BaseSettings 6 | import torch 7 | import os 8 | 9 | 10 | class Settings(BaseSettings): 11 | # Paths 12 | BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 13 | OUTPUT_DIR: str = os.path.join(BASE_DIR, "conversion_results") 14 | FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts") 15 | DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data") 16 | ARTIFACT_URL: str = "https://models.datalab.to/artifacts" 17 | FONT_NAME: str = "GoNotoCurrent-Regular.ttf" 18 | FONT_PATH: str = os.path.join(FONT_DIR, FONT_NAME) 19 | LOGLEVEL: str = "INFO" 20 | 21 | # General 22 | OUTPUT_ENCODING: str = "utf-8" 23 | OUTPUT_IMAGE_FORMAT: str = "JPEG" 24 | 25 | # LLM 26 | GOOGLE_API_KEY: Optional[str] = "" 27 | 28 | # General models 29 | TORCH_DEVICE: Optional[str] = ( 30 | None # Note: MPS device does not work for text detection, and will default to CPU 31 | ) 32 | 33 | @computed_field 34 | @property 35 | def TORCH_DEVICE_MODEL(self) -> str: 36 | if self.TORCH_DEVICE is not None: 37 | return self.TORCH_DEVICE 38 | 39 | if torch.cuda.is_available(): 40 | return "cuda" 41 | 42 | if torch.backends.mps.is_available(): 43 | return "mps" 44 | 45 | return "cpu" 46 | 47 | @computed_field 48 | @property 49 | def MODEL_DTYPE(self) -> torch.dtype: 50 | if self.TORCH_DEVICE_MODEL == "cuda": 51 | return torch.bfloat16 52 | else: 53 | return torch.float32 54 | 55 | class Config: 56 | env_file = find_dotenv("local.env") 57 | extra = "ignore" 58 | 59 | 60 | settings = Settings() 61 | -------------------------------------------------------------------------------- /marker_app.py: -------------------------------------------------------------------------------- 1 | from marker.scripts.run_streamlit_app import streamlit_app_cli 2 | 3 | if __name__ == "__main__": 4 | streamlit_app_cli() -------------------------------------------------------------------------------- /marker_server.py: -------------------------------------------------------------------------------- 1 | from marker.scripts.server import server_cli 2 | 3 | if __name__ == "__main__": 4 | server_cli() 5 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "marker-pdf" 3 | version = "1.7.4" 4 | description = "Convert documents to markdown with high speed and accuracy." 
5 | authors = ["Vik Paruchuri "] 6 | readme = "README.md" 7 | license = "GPL-3.0-or-later" 8 | repository = "https://github.com/VikParuchuri/marker" 9 | keywords = ["pdf", "markdown", "ocr", "nlp"] 10 | packages = [ 11 | {include = "marker"} 12 | ] 13 | include = [ 14 | "marker/scripts/*.sh", 15 | "marker/scripts/*.html", 16 | ] 17 | 18 | [tool.poetry.dependencies] 19 | python = "^3.10" 20 | Pillow = "^10.1.0" 21 | pydantic = "^2.4.2" 22 | pydantic-settings = "^2.0.3" 23 | transformers = "^4.45.2" 24 | python-dotenv = "^1.0.0" 25 | torch = "^2.7.0" 26 | tqdm = "^4.66.1" 27 | ftfy = "^6.1.1" 28 | rapidfuzz = "^3.8.1" 29 | surya-ocr = "^0.14.5" 30 | regex = "^2024.4.28" 31 | pdftext = "~0.6.2" 32 | markdownify = "^0.13.1" 33 | click = "^8.2.0" 34 | markdown2 = "^2.5.2" 35 | filetype = "^1.2.0" 36 | scikit-learn = "^1.6.1" 37 | google-genai = "^1.0.0" 38 | anthropic = "^0.46.0" 39 | pre-commit = "^4.2.0" 40 | 41 | # Optional dependencies for documents 42 | mammoth = {version = "^1.9.0", optional = true} 43 | openpyxl = {version = "^3.1.5", optional = true} 44 | python-pptx = {version = "^1.0.2", optional = true} 45 | ebooklib = {version = "^0.18", optional = true} 46 | weasyprint = {version = "^63.1", optional = true} 47 | openai = "^1.65.2" 48 | 49 | [tool.poetry.group.dev.dependencies] 50 | jupyter = "^1.0.0" 51 | datasets = "^2.21.0" 52 | streamlit = "^1.37.1" 53 | fastapi = "^0.115.4" 54 | uvicorn = "^0.32.0" 55 | python-multipart = "^0.0.16" 56 | pytest = "^8.3.3" 57 | pytest-mock = "^3.14.0" 58 | apted = "1.0.3" 59 | distance = "0.1.3" 60 | lxml = "5.3.0" 61 | tabulate = "^0.9.0" 62 | latex2mathml = "^3.77.0" 63 | playwright = "^1.49.1" 64 | 65 | [tool.poetry.extras] 66 | full = ["mammoth", "openpyxl", "python-pptx", "ebooklib", "weasyprint"] 67 | 68 | [tool.poetry.scripts] 69 | marker = "marker.scripts.convert:convert_cli" 70 | marker_single = "marker.scripts.convert_single:convert_single_cli" 71 | marker_chunk_convert = "marker.scripts.chunk_convert:chunk_convert_cli" 72 | marker_gui = "marker.scripts.run_streamlit_app:streamlit_app_cli" 73 | marker_extract = "marker.scripts.run_streamlit_app:extraction_app_cli" 74 | marker_server = "marker.scripts.server:server_cli" 75 | 76 | [build-system] 77 | requires = ["poetry-core"] 78 | build-backend = "poetry.core.masonry.api" -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths=tests 3 | markers = 4 | filename(name): specify the filename for the pdf_document fixture 5 | filterwarnings = 6 | ignore::Warning -------------------------------------------------------------------------------- /static/fonts/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /tests/builders/test_blank_page.py: -------------------------------------------------------------------------------- 1 | from surya.layout.schema import LayoutResult 2 | 3 | from marker.builders.document import DocumentBuilder 4 | from marker.builders.layout import LayoutBuilder 5 | from marker.builders.line import LineBuilder 6 | 7 | 8 | def test_blank_page(config, doc_provider, layout_model, ocr_error_model, detection_model): 9 | layout_builder = LayoutBuilder(layout_model, config) 10 | line_builder = LineBuilder(detection_model, ocr_error_model) 11 | builder = DocumentBuilder(config) 12 | document = 
builder.build_document(doc_provider) 13 | 14 | layout_results = [LayoutResult( 15 | bboxes=[], 16 | image_bbox=p.polygon.bbox, 17 | ) for p in document.pages] 18 | provider_lines = {p.page_id: [] for p in document.pages} 19 | ocr_lines = {p.page_id: [] for p in document.pages} 20 | 21 | layout_builder.add_blocks_to_pages(document.pages, layout_results) 22 | line_builder.merge_blocks(document, provider_lines, ocr_lines) 23 | 24 | assert all([isinstance(p.children, list) for p in document.pages]) 25 | assert all([isinstance(p.structure, list) for p in document.pages]) -------------------------------------------------------------------------------- /tests/builders/test_document_builder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from marker.schema import BlockTypes 4 | from marker.schema.text.line import Line 5 | 6 | 7 | @pytest.mark.config({"page_range": [0]}) 8 | def test_document_builder(pdf_document): 9 | first_page = pdf_document.pages[0] 10 | assert first_page.structure[0] == '/page/0/SectionHeader/0' 11 | 12 | first_block = first_page.get_block(first_page.structure[0]) 13 | assert first_block.block_type == BlockTypes.SectionHeader 14 | assert first_block.text_extraction_method == 'pdftext' 15 | 16 | first_text_block: Line = first_page.get_block(first_block.structure[0]) 17 | assert first_text_block.block_type == BlockTypes.Line 18 | 19 | first_span = first_page.get_block(first_text_block.structure[0]) 20 | assert first_span.block_type == BlockTypes.Span 21 | assert first_span.text == 'Subspace Adversarial Training' 22 | assert first_span.font == 'NimbusRomNo9L-Medi' 23 | assert first_span.formats == ['plain'] 24 | -------------------------------------------------------------------------------- /tests/builders/test_garbled_pdf.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from marker.builders.document import DocumentBuilder 4 | from marker.builders.line import LineBuilder 5 | from marker.processors.table import TableProcessor 6 | from marker.schema import BlockTypes 7 | 8 | 9 | @pytest.mark.filename("water_damage.pdf") 10 | def test_garbled_pdf(pdf_document, detection_model, recognition_model, table_rec_model): 11 | assert pdf_document.pages[0].structure[0] == "/page/0/Table/0" 12 | 13 | table_block = pdf_document.pages[0].get_block(pdf_document.pages[0].structure[0]) 14 | assert table_block.block_type == BlockTypes.Table 15 | assert table_block.structure[0] == "/page/0/Line/1" 16 | 17 | table_cell = pdf_document.pages[0].get_block(table_block.structure[0]) 18 | assert table_cell.block_type == BlockTypes.Line 19 | 20 | # We don't OCR in the initial pass, only with the TableProcessor 21 | processor = TableProcessor(detection_model, recognition_model, table_rec_model) 22 | processor(pdf_document) 23 | 24 | table = pdf_document.pages[0].contained_blocks(pdf_document, (BlockTypes.Table,))[0] 25 | assert "варіант" in table.raw_text(pdf_document) 26 | 27 | table_cell = pdf_document.pages[0].get_block(table_block.structure[0]) 28 | assert table_cell.block_type == BlockTypes.TableCell 29 | 30 | 31 | @pytest.mark.filename("hindi_judgement.pdf") 32 | @pytest.mark.config({"page_range": [2, 3], "disable_ocr": True}) 33 | def test_garbled_builder(config, doc_provider, detection_model, ocr_error_model): 34 | line_builder = LineBuilder(detection_model, ocr_error_model, config) 35 | builder = DocumentBuilder(config) 36 | document = builder.build_document(doc_provider) 37 | 
38 | bad_ocr_results = line_builder.ocr_error_detection( 39 | document.pages, doc_provider.page_lines 40 | ) 41 | assert len(bad_ocr_results.labels) == 2 42 | assert any([label == "bad" for label in bad_ocr_results.labels]) 43 | 44 | 45 | @pytest.mark.filename("adversarial.pdf") 46 | @pytest.mark.config({"page_range": [2, 3], "disable_ocr": True}) 47 | def test_nongarbled_builder(config, doc_provider, detection_model, ocr_error_model): 48 | line_builder = LineBuilder(detection_model, ocr_error_model, config) 49 | builder = DocumentBuilder(config) 50 | document = builder.build_document(doc_provider) 51 | 52 | bad_ocr_results = line_builder.ocr_error_detection( 53 | document.pages, doc_provider.page_lines 54 | ) 55 | assert len(bad_ocr_results.labels) == 2 56 | assert all([label == "good" for label in bad_ocr_results.labels]) 57 | -------------------------------------------------------------------------------- /tests/builders/test_layout_replace.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from marker.builders.document import DocumentBuilder 4 | from marker.builders.layout import LayoutBuilder 5 | from marker.builders.line import LineBuilder 6 | from marker.renderers.markdown import MarkdownRenderer 7 | from marker.schema import BlockTypes 8 | from marker.schema.registry import get_block_class 9 | 10 | 11 | @pytest.mark.config({"page_range": [0]}) 12 | def test_layout_replace(request, config, doc_provider, layout_model, ocr_error_model, detection_model): 13 | # The llm layout builder replaces blocks - this makes sure text is still merged properly 14 | layout_builder = LayoutBuilder(layout_model, config) 15 | line_builder = LineBuilder(detection_model, ocr_error_model, config) 16 | builder = DocumentBuilder(config) 17 | document = builder.build_document(doc_provider) 18 | layout_builder(document, doc_provider) 19 | page = document.pages[0] 20 | new_blocks = [] 21 | for block in page.contained_blocks(document, (BlockTypes.Text,)): 22 | generated_block_class = get_block_class(BlockTypes.TextInlineMath) 23 | generated_block = generated_block_class( 24 | polygon=block.polygon, 25 | page_id=block.page_id, 26 | structure=block.structure, 27 | ) 28 | page.replace_block(block, generated_block) 29 | new_blocks.append(generated_block) 30 | line_builder(document, doc_provider) 31 | 32 | for block in new_blocks: 33 | assert block.raw_text(document).strip() 34 | 35 | renderer = MarkdownRenderer(config) 36 | rendered = renderer(document) 37 | 38 | assert "worst-case perturbations" in rendered.markdown 39 | assert "projected gradient descent" in rendered.markdown 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /tests/builders/test_line_builder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from marker.schema import BlockTypes 4 | 5 | # Page contains provider lines that are longer than detected lines 6 | # Any bad merging will cause broken final OCR results with format lines 7 | @pytest.mark.filename("mixed_eng_hindi.pdf") 8 | @pytest.mark.config({"page_range": [2], "format_lines": True}) 9 | def test_provider_detected_line_merge(pdf_document): 10 | page = pdf_document.pages[0] 11 | text_lines = page.contained_blocks(pdf_document, (BlockTypes.Line,)) 12 | 13 | # This count includes detected lines merged in with provider lines 14 | assert len(text_lines) == 83 15 | 16 | # Page provider lines only contain english, while the hindi 
is missing 17 | # format_lines should fill in the missing lines 18 | @pytest.mark.filename("mixed_eng_hindi.pdf") 19 | @pytest.mark.config({"page_range": [0], "format_lines": True}) 20 | def test_fill_missing_provider_lines(pdf_document): 21 | page = pdf_document.pages[0] 22 | raw_text = page.raw_text(pdf_document) 23 | assert "प्राधिकार से प्रकाशित" in raw_text 24 | assert "खान मंत्रालय" in raw_text -------------------------------------------------------------------------------- /tests/builders/test_merged_lines.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from marker.schema import BlockTypes 4 | 5 | 6 | @pytest.mark.config({"page_range": [6], "format_lines": True, "disable_ocr": True}) 7 | @pytest.mark.filename("bad_math.pdf") 8 | def test_keep_ocr(pdf_document): 9 | contained_lines = pdf_document.pages[0].contained_blocks( 10 | pdf_document, [BlockTypes.Line] 11 | ) 12 | 13 | # Check that we grabbed the right text 14 | assert "Lemma" in contained_lines[-1].formatted_text(pdf_document) 15 | assert "distribution" in contained_lines[-2].formatted_text(pdf_document) 16 | 17 | # Line 2 comes after line 1 18 | assert contained_lines[-1].polygon.bbox[1] > contained_lines[-2].polygon.bbox[3] 19 | -------------------------------------------------------------------------------- /tests/builders/test_ocr_builder.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | 3 | from marker.builders.ocr import OcrBuilder 4 | 5 | 6 | def test_blank_char_builder(recognition_model): 7 | builder = OcrBuilder(recognition_model) 8 | image = Image.new("RGB", (100, 100)) 9 | spans = builder.spans_from_html_chars([], None, image) # Test with empty char list 10 | assert len(spans) == 0 11 | -------------------------------------------------------------------------------- /tests/builders/test_ocr_pipeline.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from marker.schema import BlockTypes 4 | from marker.schema.text.line import Line 5 | 6 | 7 | def _ocr_pipeline_test(pdf_document): 8 | first_page = pdf_document.pages[0] 9 | assert first_page.structure[0] == "/page/0/SectionHeader/0" 10 | 11 | first_block = first_page.get_block(first_page.structure[0]) 12 | assert first_block.text_extraction_method == "surya" 13 | assert first_block.block_type == BlockTypes.SectionHeader 14 | 15 | first_text_block: Line = first_page.get_block(first_block.structure[0]) 16 | assert first_text_block.block_type == BlockTypes.Line 17 | 18 | first_span = first_page.get_block(first_text_block.structure[0]) 19 | assert first_span.block_type == BlockTypes.Span 20 | assert first_span.text.strip() == "Subspace Adversarial Training" 21 | 22 | # Ensure we match all text lines up properly 23 | # Makes sure the OCR bbox is being scaled to the same scale as the layout boxes 24 | text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,)) 25 | text_blocks = first_page.contained_blocks( 26 | pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath) 27 | ) 28 | assert len(text_lines) == 83 29 | 30 | # Ensure the bbox sizes match up 31 | max_line_position = max([line.polygon.y_end for line in text_lines]) 32 | max_block_position = max( 33 | [block.polygon.y_end for block in text_blocks if block.source == "layout"] 34 | ) 35 | assert max_line_position <= (max_block_position * 1.02) 36 | 37 | 38 | @pytest.mark.config({"force_ocr": True, "page_range": [0]}) 
39 | def test_ocr_pipeline(pdf_document): 40 | _ocr_pipeline_test(pdf_document) 41 | 42 | 43 | @pytest.mark.config({"force_ocr": True, "page_range": [0], "use_llm": True}) 44 | def test_ocr_with_inline_pipeline(pdf_document): 45 | _ocr_pipeline_test(pdf_document) 46 | -------------------------------------------------------------------------------- /tests/builders/test_overriding.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | 3 | import pytest 4 | 5 | from marker.providers.pdf import PdfProvider 6 | from marker.schema import BlockTypes 7 | from marker.schema.blocks import SectionHeader 8 | from marker.schema.document import Document 9 | from marker.schema.registry import register_block_class 10 | from marker.schema.text import Line 11 | from tests.utils import setup_pdf_provider 12 | 13 | 14 | class NewSectionHeader(SectionHeader): 15 | pass 16 | 17 | 18 | class NewLine(Line): 19 | pass 20 | 21 | 22 | @pytest.mark.config({ 23 | "page_range": [0], 24 | "override_map": {BlockTypes.SectionHeader: NewSectionHeader} 25 | }) 26 | def test_overriding(pdf_document: Document): 27 | assert pdf_document.pages[0]\ 28 | .get_block(pdf_document.pages[0].structure[0]).__class__ == NewSectionHeader 29 | 30 | 31 | def get_lines(pdf: str, config=None): 32 | for block_type, block_cls in config["override_map"].items(): 33 | register_block_class(block_type, block_cls) 34 | 35 | provider: PdfProvider = setup_pdf_provider(pdf, config) 36 | return provider.get_page_lines(0) 37 | 38 | 39 | def test_overriding_mp(): 40 | config = { 41 | "page_range": [0], 42 | "override_map": {BlockTypes.Line: NewLine} 43 | } 44 | 45 | pdf_list = ["adversarial.pdf", "adversarial_rot.pdf"] 46 | 47 | with mp.Pool(processes=2) as pool: 48 | results = pool.starmap(get_lines, [(pdf, config) for pdf in pdf_list]) 49 | assert all([r[0].line.__class__ == NewLine for r in results]) 50 | -------------------------------------------------------------------------------- /tests/builders/test_pdf_links.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | 5 | from marker.converters.pdf import PdfConverter 6 | from marker.renderers.markdown import MarkdownOutput 7 | from marker.schema import BlockTypes 8 | from marker.schema.document import Document 9 | from marker.util import classes_to_strings 10 | 11 | 12 | @pytest.mark.filename("arxiv_test.pdf") 13 | @pytest.mark.output_format("markdown") 14 | def test_pdf_links(pdf_document: Document, config, renderer, model_dict, temp_doc): 15 | first_page = pdf_document.pages[1] 16 | 17 | processors = ["marker.processors.reference.ReferenceProcessor"] 18 | pdf_converter = PdfConverter( 19 | artifact_dict=model_dict, 20 | processor_list=processors, 21 | renderer=classes_to_strings([renderer])[0], 22 | config=config 23 | ) 24 | 25 | for section_header_span in first_page.contained_blocks(pdf_document, (BlockTypes.Span,)): 26 | if "II." in section_header_span.text: 27 | assert section_header_span.url == "#page-1-0" 28 | break 29 | else: 30 | raise ValueError("Could not find II. in the first page") 31 | 32 | section_header_block = first_page.contained_blocks(pdf_document, (BlockTypes.SectionHeader,))[0] 33 | assert section_header_block.raw_text(pdf_document) == 'II. 
THEORETICAL FRAMEWORK\n' 34 | 35 | assert first_page.refs[0].ref == "page-1-0" 36 | 37 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name) 38 | markdown = markdown_output.markdown 39 | 40 | assert '[II.](#page-1-0)' in markdown 41 | assert 'II. THEORETICAL FRAMEWORK' in markdown 42 | 43 | for ref in set([f'<span id="page-{m[0]}-{m[1]}"></span>' for m in re.findall(r'\]\(#page-(\d+)-(\d+)\)', markdown)]): 44 | assert ref in markdown, f"Reference {ref} not found in markdown" 45 |
-------------------------------------------------------------------------------- /tests/builders/test_rotated_bboxes.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from marker.schema import BlockTypes 4 | 5 | 6 | @pytest.mark.config({"page_range": [0]}) 7 | @pytest.mark.filename("adversarial_rot.pdf") 8 | def test_rotated_bboxes(pdf_document): 9 | first_page = pdf_document.pages[0] 10 | 11 | # Ensure we match all text lines up properly 12 | text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,)) 13 | text_blocks = first_page.contained_blocks( 14 | pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath) 15 | ) 16 | assert len(text_lines) == 85 17 | 18 | # Ensure the bbox sizes match up 19 | max_line_position = max([line.polygon.x_end for line in text_lines]) 20 | max_block_position = max( 21 | [block.polygon.x_end for block in text_blocks if block.source == "layout"] 22 | ) 23 | assert max_line_position <= max_block_position 24 |
-------------------------------------------------------------------------------- /tests/builders/test_strip_existing_ocr.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.config({"page_range": [0], "strip_existing_ocr": True}) 5 | @pytest.mark.filename("handwritten.pdf") 6 | def test_strip_ocr(doc_provider): 7 | # Ensure that the OCR text isn't extracted 8 | assert len(doc_provider.page_lines) == 0 9 | 10 | 11 | @pytest.mark.config({"page_range": [0]}) 12 | @pytest.mark.filename("handwritten.pdf") 13 | def test_keep_ocr(doc_provider): 14 | assert len(doc_provider.page_lines) == 1 15 |
-------------------------------------------------------------------------------- /tests/builders/test_structure.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from marker.builders.structure import StructureBuilder 4 | 5 | 6 | @pytest.mark.config({"page_range": [0]}) 7 | def test_structure_builder(pdf_document): 8 | structure = StructureBuilder() 9 | structure(pdf_document) 10 | assert len(pdf_document.pages[0].structure) > 0 11 |
-------------------------------------------------------------------------------- /tests/config/test_config.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from contextlib import suppress 3 | import click 4 | 5 | from marker.config.printer import CustomClickPrinter 6 | from marker.config.crawler import crawler 7 | from marker.config.parser import ConfigParser 8 | 9 | 10 | def capture_kwargs(argv): 11 | command = click.command(cls=CustomClickPrinter) 12 | captured_kwargs = {} 13 | 14 | def parse_args(**kwargs): 15 | captured_kwargs.update(kwargs) 16 | return kwargs 17 | 18 | original_argv = sys.argv 19 | sys.argv = argv 20 | try: 21 | with suppress(SystemExit): 22 | command(ConfigParser.common_options(parse_args))() 23 | finally: 24 | sys.argv = original_argv 25 | 26 | return captured_kwargs 27 | 28 | 29 | def test_config_parser():
30 | sys.argv = [ 31 | "test", 32 | "--disable_multiprocessing", 33 | "--output_dir", 34 | "output_dir", 35 | "--height_tolerance", 36 | "0.5", 37 | ] 38 | kwargs = capture_kwargs(sys.argv) 39 | parser = ConfigParser(kwargs) 40 | config_dict = parser.generate_config_dict() 41 | 42 | # Validate kwarg capturing 43 | assert kwargs["disable_multiprocessing"] 44 | assert kwargs["output_dir"] == "output_dir" 45 | 46 | assert config_dict["pdftext_workers"] == 1 # disabling multiprocessing does this 47 | assert config_dict["height_tolerance"] == 0.5 48 | assert "output_dir" not in config_dict # This is not a config key 49 | 50 | 51 | def test_config_none(): 52 | kwargs = capture_kwargs(["test"]) 53 | 54 | for key in crawler.attr_set: 55 | # We force some options to become flags for ease of use on the CLI 56 | value = None 57 | assert kwargs.get(key) is value 58 | 59 | 60 | def test_config_llm(): 61 | kwargs = capture_kwargs(["test", "--use_llm"]) 62 | parser = ConfigParser(kwargs) 63 | config_dict = parser.generate_config_dict() 64 | 65 | # Validate kwarg capturing 66 | assert config_dict["use_llm"] 67 | 68 | 69 | def test_config_force_ocr(): 70 | kwargs = capture_kwargs(["test", "--force_ocr", "--format_lines"]) 71 | parser = ConfigParser(kwargs) 72 | config_dict = parser.generate_config_dict() 73 | 74 | # Validate kwarg capturing 75 | assert config_dict["force_ocr"] 76 | assert config_dict["format_lines"] 77 | -------------------------------------------------------------------------------- /tests/converters/test_extraction_converter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pytest 3 | 4 | from marker.converters.extraction import ExtractionConverter 5 | from marker.extractors.page import PageExtractionSchema 6 | from marker.services import BaseService 7 | 8 | 9 | class MockLLMService(BaseService): 10 | def __call__(self, prompt, image=None, page=None, response_schema=None, **kwargs): 11 | assert response_schema == PageExtractionSchema 12 | return { 13 | "description": "Mock extraction description", 14 | "extracted_json": json.dumps({"test_key": "test_value"}), 15 | "existence_confidence": 5, 16 | "value_confidence": 5, 17 | } 18 | 19 | 20 | @pytest.fixture 21 | def mock_llm_service(): 22 | return MockLLMService 23 | 24 | 25 | @pytest.fixture 26 | def extraction_converter(config, model_dict, mock_llm_service): 27 | test_schema = { 28 | "title": "TestSchema", 29 | "type": "object", 30 | "properties": {"test_key": {"title": "Test Key", "type": "string"}}, 31 | "required": ["test_key"], 32 | } 33 | 34 | config["page_schema"] = json.dumps(test_schema) 35 | config["output_format"] = "markdown" 36 | model_dict["llm_service"] = mock_llm_service 37 | 38 | converter = ExtractionConverter( 39 | artifact_dict=model_dict, processor_list=None, config=config 40 | ) 41 | converter.default_llm_service = MockLLMService 42 | return converter 43 | 44 | 45 | @pytest.mark.config({"page_range": [0]}) 46 | def test_extraction_converter_invalid_schema( 47 | config, model_dict, mock_llm_service, temp_doc 48 | ): 49 | config["page_schema"] = "invalid json" 50 | 51 | model_dict["llm_service"] = mock_llm_service 52 | converter = ExtractionConverter( 53 | artifact_dict=model_dict, processor_list=None, config=config 54 | ) 55 | 56 | with pytest.raises(ValueError): 57 | converter(temp_doc.name) 58 | 59 | 60 | @pytest.mark.config({"page_range": [0, 1]}) 61 | def test_extraction_converter_multiple_pages(extraction_converter, temp_doc): 62 | result = 
extraction_converter(temp_doc.name) 63 | 64 | assert result is not None 65 | assert result.document_json is not None 66 | assert result.document_json == {"test_key": "test_value"} 67 | -------------------------------------------------------------------------------- /tests/converters/test_ocr_converter.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from marker.converters.ocr import OCRConverter 4 | from marker.renderers.ocr_json import OCRJSONOutput, OCRJSONPageOutput 5 | 6 | 7 | def _ocr_converter(config, model_dict, temp_pdf, line_count: int, eq_count: int): 8 | converter = OCRConverter(artifact_dict=model_dict, config=config) 9 | 10 | ocr_json: OCRJSONOutput = converter(temp_pdf.name) 11 | pages = ocr_json.children 12 | 13 | assert len(pages) == 1 14 | assert len(pages[0].children) == line_count 15 | eqs = [line for line in pages[0].children if line.block_type == "Equation"] 16 | assert len(eqs) == eq_count 17 | return pages 18 | 19 | 20 | def check_bboxes(page: OCRJSONPageOutput, lines): 21 | page_size = page.bbox 22 | for line in lines: 23 | assert len(line.children) > 0 24 | for child in line.children: 25 | bbox = child.bbox 26 | assert all( 27 | [ 28 | bbox[0] >= page_size[0], 29 | bbox[1] >= page_size[1], 30 | bbox[2] <= page_size[2], 31 | bbox[3] <= page_size[3], 32 | ] 33 | ), "Child bbox is outside page bbox" 34 | 35 | 36 | @pytest.mark.config({"page_range": [0]}) 37 | def test_ocr_converter(config, model_dict, temp_doc): 38 | _ocr_converter(config, model_dict, temp_doc, 84, 2) 39 | 40 | 41 | @pytest.mark.filename("pres.pdf") 42 | @pytest.mark.config({"page_range": [1], "force_ocr": True, "keep_chars": True}) 43 | def test_ocr_converter_force(config, model_dict, temp_doc): 44 | pages = _ocr_converter(config, model_dict, temp_doc, 10, 0) 45 | lines = [line for line in pages[0].children if line.block_type == "Line"] 46 | check_bboxes(pages[0], lines) 47 | 48 | 49 | @pytest.mark.filename("pres.pdf") 50 | @pytest.mark.config({"page_range": [1], "keep_chars": True}) 51 | def test_ocr_converter_keep(config, model_dict, temp_doc): 52 | pages = _ocr_converter(config, model_dict, temp_doc, 9, 0) 53 | lines = [line for line in pages[0].children if line.block_type == "Line"] 54 | check_bboxes(pages[0], lines) 55 | -------------------------------------------------------------------------------- /tests/converters/test_table_converter.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from marker.converters.table import TableConverter 3 | from marker.renderers.markdown import MarkdownOutput 4 | from marker.util import classes_to_strings 5 | 6 | def _table_converter(config, model_dict, renderer, temp_pdf): 7 | converter = TableConverter( 8 | artifact_dict=model_dict, 9 | processor_list=None, 10 | renderer=classes_to_strings([renderer])[0], 11 | config=config 12 | ) 13 | 14 | markdown_output: MarkdownOutput = converter(temp_pdf.name) 15 | markdown = markdown_output.markdown 16 | 17 | assert len(markdown) > 0 18 | assert "cyclic" in markdown 19 | 20 | 21 | @pytest.mark.output_format("markdown") 22 | @pytest.mark.config({"page_range": [5]}) 23 | def test_table_converter(config, model_dict, renderer, temp_doc): 24 | _table_converter(config, model_dict, renderer, temp_doc) 25 | 26 | @pytest.mark.output_format("markdown") 27 | @pytest.mark.config({"page_range": [5], "force_ocr": True}) 28 | def test_table_converter_ocr(config, model_dict, renderer, temp_doc): 29 | 
_table_converter(config, model_dict, renderer, temp_doc) 30 | 31 | -------------------------------------------------------------------------------- /tests/processors/test_document_toc_processor.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from marker.processors.document_toc import DocumentTOCProcessor 4 | 5 | 6 | @pytest.mark.config({"page_range": [0]}) 7 | def test_document_toc_processor(pdf_document, detection_model, recognition_model, table_rec_model): 8 | processor = DocumentTOCProcessor() 9 | processor(pdf_document) 10 | 11 | assert len(pdf_document.table_of_contents) == 3 12 | assert pdf_document.table_of_contents[0]["title"] == "Subspace Adversarial Training" 13 | -------------------------------------------------------------------------------- /tests/processors/test_equation_processor.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from marker.schema import BlockTypes 4 | from marker.processors.equation import EquationProcessor 5 | 6 | 7 | @pytest.mark.config({"page_range": [0]}) 8 | def test_equation_processor(pdf_document, recognition_model): 9 | processor = EquationProcessor(recognition_model) 10 | processor(pdf_document) 11 | 12 | for block in pdf_document.pages[0].children: 13 | if block.block_type == BlockTypes.Equation: 14 | assert block.html is not None -------------------------------------------------------------------------------- /tests/processors/test_footnote_processor.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from marker.processors.footnote import FootnoteProcessor 4 | from marker.schema import BlockTypes 5 | 6 | 7 | @pytest.mark.filename("population_stats.pdf") 8 | @pytest.mark.config({"page_range": [4]}) 9 | def test_footnote_processor(pdf_document): 10 | processor = FootnoteProcessor() 11 | processor(pdf_document) 12 | 13 | page0_footnotes = pdf_document.pages[0].contained_blocks(pdf_document, [BlockTypes.Footnote]) 14 | assert len(page0_footnotes) >= 2 15 | 16 | assert page0_footnotes[-1].raw_text(pdf_document).strip().startswith("5") 17 | -------------------------------------------------------------------------------- /tests/processors/test_ignoretext.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from marker.processors.ignoretext import IgnoreTextProcessor 4 | from marker.schema import BlockTypes 5 | 6 | 7 | @pytest.mark.filename("bio_pdf.pdf") 8 | @pytest.mark.config({"page_range": list(range(10))}) 9 | def test_ignoretext_processor(pdf_document): 10 | processor = IgnoreTextProcessor() 11 | processor(pdf_document) 12 | 13 | page1_header = pdf_document.pages[1].contained_blocks(pdf_document, [BlockTypes.Text])[0] 14 | assert "bioRxiv" in page1_header.raw_text(pdf_document) 15 | 16 | assert page1_header.ignore_for_output is True 17 | -------------------------------------------------------------------------------- /tests/processors/test_table_merge.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | import pytest 4 | 5 | from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor 6 | from marker.processors.table import TableProcessor 7 | from marker.schema import BlockTypes 8 | 9 | 10 | @pytest.mark.filename("table_ex2.pdf") 11 | def test_llm_table_processor_nomerge(pdf_document, detection_model, table_rec_model, 
recognition_model, mocker): 12 | mock_cls = Mock() 13 | mock_cls.return_value = { 14 | "merge": "true", 15 | "direction": "right" 16 | } 17 | 18 | cell_processor = TableProcessor(detection_model, recognition_model, table_rec_model) 19 | cell_processor(pdf_document) 20 | 21 | tables = pdf_document.contained_blocks((BlockTypes.Table,)) 22 | assert len(tables) == 3 23 | 24 | processor = LLMTableMergeProcessor(mock_cls, {"use_llm": True, "gemini_api_key": "test"}) 25 | processor(pdf_document) 26 | 27 | tables = pdf_document.contained_blocks((BlockTypes.Table,)) 28 | assert len(tables) == 3 -------------------------------------------------------------------------------- /tests/processors/test_table_processor.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import pytest 4 | from marker.renderers.json import JSONRenderer 5 | 6 | from marker.renderers.markdown import MarkdownRenderer 7 | from marker.schema import BlockTypes 8 | from marker.processors.table import TableProcessor 9 | from marker.schema.blocks import TableCell 10 | 11 | 12 | @pytest.mark.config({"page_range": [5]}) 13 | def test_table_processor(pdf_document, detection_model, recognition_model, table_rec_model): 14 | processor = TableProcessor(detection_model, recognition_model, table_rec_model) 15 | processor(pdf_document) 16 | 17 | for block in pdf_document.pages[0].children: 18 | if block.block_type == BlockTypes.Table: 19 | children = block.contained_blocks(pdf_document, (BlockTypes.TableCell,)) 20 | assert children 21 | assert len(children) > 0 22 | assert isinstance(children[0], TableCell) 23 | 24 | assert len(pdf_document.contained_blocks((BlockTypes.Table,))) == 2 25 | 26 | renderer = MarkdownRenderer() 27 | table_output = renderer(pdf_document) 28 | assert "Schedule" in table_output.markdown 29 | 30 | 31 | @pytest.mark.filename("table_ex.pdf") 32 | @pytest.mark.config({"page_range": [0], "force_ocr": True}) 33 | def test_avoid_double_ocr(pdf_document, detection_model, recognition_model, table_rec_model): 34 | tables = pdf_document.contained_blocks((BlockTypes.Table,)) 35 | lines = tables[0].contained_blocks(pdf_document, (BlockTypes.Line,)) 36 | assert len(lines) == 0 37 | 38 | processor = TableProcessor(detection_model, recognition_model, table_rec_model, config={"force_ocr": True}) 39 | processor(pdf_document) 40 | 41 | renderer = MarkdownRenderer() 42 | table_output = renderer(pdf_document) 43 | assert "Participants" in table_output.markdown 44 | 45 | 46 | @pytest.mark.filename("multicol-blocks.pdf") 47 | @pytest.mark.config({"page_range": [3]}) 48 | def test_overlap_blocks(pdf_document, detection_model, recognition_model, table_rec_model): 49 | page = pdf_document.pages[0] 50 | assert "Cascading, and the Auxiliary Problem Principle" in page.raw_text(pdf_document) 51 | 52 | processor = TableProcessor(detection_model, recognition_model, table_rec_model) 53 | processor(pdf_document) 54 | 55 | assert "Cascading, and the Auxiliary Problem Principle" in page.raw_text(pdf_document) 56 | 57 | 58 | @pytest.mark.filename("pres.pdf") 59 | @pytest.mark.config({"page_range": [4]}) 60 | def test_ocr_table(pdf_document, detection_model, recognition_model, table_rec_model): 61 | processor = TableProcessor(detection_model, recognition_model, table_rec_model) 62 | processor(pdf_document) 63 | 64 | renderer = MarkdownRenderer() 65 | table_output = renderer(pdf_document) 66 | assert "1.2E-38" in table_output.markdown 67 | 68 | 69 | @pytest.mark.config({"page_range": 
[11]}) 70 | def test_split_rows(pdf_document, detection_model, recognition_model, table_rec_model): 71 | processor = TableProcessor(detection_model, recognition_model, table_rec_model) 72 | processor(pdf_document) 73 | 74 | table = pdf_document.contained_blocks((BlockTypes.Table,))[-1] 75 | cells: List[TableCell] = table.contained_blocks(pdf_document, (BlockTypes.TableCell,)) 76 | unique_rows = len(set([cell.row_id for cell in cells])) 77 | assert unique_rows == 6 78 | 79 | 80 | -------------------------------------------------------------------------------- /tests/providers/test_document_providers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.config({"page_range": [0]}) 5 | @pytest.mark.filename("lambda.pptx") 6 | def test_pptx_provider(doc_provider): 7 | assert doc_provider.get_images([0], 72)[0].size == (842, 596) 8 | 9 | page_lines = doc_provider.get_page_lines(0) 10 | 11 | spans = page_lines[0].spans 12 | assert spans[0].text == "Lambda Calculus" 13 | 14 | spans = page_lines[1].spans 15 | assert spans[0].text == "CSE 340 – Principles of Programming Languages" 16 | 17 | 18 | @pytest.mark.config({"page_range": [0]}) 19 | @pytest.mark.filename("manual.epub") 20 | def test_epub_provider(doc_provider): 21 | assert doc_provider.get_images([0], 72)[0].size == (596, 842) 22 | 23 | page_lines = doc_provider.get_page_lines(0) 24 | 25 | spans = page_lines[0].spans 26 | assert spans[0].text == "The Project Gutenberg eBook of Simple" 27 | 28 | 29 | @pytest.mark.config({"page_range": [0]}) 30 | @pytest.mark.filename("china.html") 31 | def test_html_provider(doc_provider): 32 | assert doc_provider.get_images([0], 72)[0].size == (596, 842) 33 | 34 | page_lines = doc_provider.get_page_lines(0) 35 | 36 | spans = page_lines[0].spans 37 | assert spans[0].text == "Jump to content" 38 | 39 | @pytest.mark.config({"page_range": [0]}) 40 | @pytest.mark.filename("gatsby.docx") 41 | def test_docx_provider(doc_provider): 42 | assert doc_provider.get_images([0], 72)[0].size == (596, 842) 43 | 44 | page_lines = doc_provider.get_page_lines(0) 45 | 46 | spans = page_lines[0].spans 47 | assert spans[0].text == "Themes" 48 | 49 | 50 | @pytest.mark.config({"page_range": [0]}) 51 | @pytest.mark.filename("single_sheet.xlsx") 52 | def test_xlsx_provider(doc_provider): 53 | assert doc_provider.get_images([0], 72)[0].size == (842, 596) 54 | 55 | page_lines = doc_provider.get_page_lines(0) 56 | 57 | spans = page_lines[0].spans 58 | assert spans[0].text == "Sheet1" -------------------------------------------------------------------------------- /tests/providers/test_image_provider.py: -------------------------------------------------------------------------------- 1 | from marker.providers.image import ImageProvider 2 | from marker.renderers.markdown import MarkdownOutput 3 | 4 | 5 | def test_image_provider(config, temp_image): 6 | provider = ImageProvider(temp_image.name, config) 7 | assert len(provider) == 1 8 | assert provider.get_images([0], 72)[0].size == (512, 512) 9 | 10 | page_lines = provider.get_page_lines(0) 11 | assert len(page_lines) == 0 12 | 13 | def test_image_provider_conversion(pdf_converter, temp_image): 14 | markdown_output: MarkdownOutput = pdf_converter(temp_image.name) 15 | assert "Hello, World!" 
in markdown_output.markdown 16 | 17 | 18 |
-------------------------------------------------------------------------------- /tests/providers/test_pdf_provider.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.config({"page_range": [0]}) 5 | def test_pdf_provider(doc_provider): 6 | assert len(doc_provider) == 12 7 | assert doc_provider.get_images([0], 72)[0].size == (612, 792) 8 | assert doc_provider.get_images([0], 96)[0].size == (816, 1056) 9 | 10 | page_lines = doc_provider.get_page_lines(0) 11 | assert len(page_lines) == 87 12 | 13 | spans = page_lines[0].spans 14 | assert len(spans) == 2 15 | assert spans[0].text == "Subspace Adversarial Training" 16 | assert spans[0].font == "NimbusRomNo9L-Medi" 17 | assert spans[0].formats == ["plain"] 18 |
-------------------------------------------------------------------------------- /tests/renderers/test_extract_images.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from marker.renderers.markdown import MarkdownRenderer 4 | 5 | 6 | @pytest.mark.config({"page_range": [0]}) 7 | @pytest.mark.filename("A17_FlightPlan.pdf") 8 | def test_disable_extract_images(pdf_document): 9 | renderer = MarkdownRenderer({"extract_images": False}) 10 | md = renderer(pdf_document).markdown 11 | 12 | # Verify markdown 13 | assert len(md) == 0 14 | 15 | 16 | @pytest.mark.config({"page_range": [0]}) 17 | @pytest.mark.filename("A17_FlightPlan.pdf") 18 | def test_extract_images(pdf_document): 19 | renderer = MarkdownRenderer() 20 | md = renderer(pdf_document).markdown 21 | 22 | # Verify markdown 23 | assert "jpeg" in md
-------------------------------------------------------------------------------- /tests/renderers/test_json_renderer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from marker.renderers.json import JSONRenderer 4 | 5 | 6 | @pytest.mark.config({"page_range": [0]}) 7 | def test_json_renderer(pdf_document): 8 | renderer = JSONRenderer() 9 | pages = renderer(pdf_document).children 10 | 11 | assert len(pages) == 1 12 | assert pages[0].block_type == "Page" 13 | assert pages[0].children[0].block_type == "SectionHeader"
-------------------------------------------------------------------------------- /tests/renderers/test_markdown_renderer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from marker.renderers.markdown import MarkdownRenderer 4 | from marker.schema import BlockTypes 5 | from marker.schema.blocks import TableCell 6 | 7 | 8 | @pytest.mark.config({"page_range": [0]}) 9 | def test_markdown_renderer(pdf_document): 10 | renderer = MarkdownRenderer() 11 | md = renderer(pdf_document).markdown 12 | 13 | # Verify markdown 14 | assert '# Subspace Adversarial Training' in md 15 | 16 | 17 | @pytest.mark.config({"page_range": [0, 1], "paginate_output": True}) 18 | def test_markdown_renderer_pagination(pdf_document): 19 | renderer = MarkdownRenderer({"paginate_output": True}) 20 | md = renderer(pdf_document).markdown 21 | 22 | assert "{0}-" in md 23 | assert "{1}-" in md 24 | 25 | 26 | @pytest.mark.config({"page_range": [0, 1]}) 27 | def test_markdown_renderer_metadata(pdf_document): 28 | renderer = MarkdownRenderer({"paginate_output": True}) 29 | metadata = renderer(pdf_document).metadata 30 | assert "table_of_contents" in metadata 31 | 32 | 33 | @pytest.mark.config({"page_range": [0,
1]}) 34 | def test_markdown_renderer_images(pdf_document): 35 | renderer = MarkdownRenderer({"extract_images": False}) 36 | markdown_output = renderer(pdf_document) 37 | 38 | assert len(markdown_output.images) == 0 39 | assert '![](' not in markdown_output.markdown 40 | 41 | @pytest.mark.config({"page_range": [5]}) 42 | def test_markdown_renderer_tables(pdf_document): 43 | table = pdf_document.contained_blocks((BlockTypes.Table,))[0] 44 | page = pdf_document.pages[0] 45 | 46 | cell = TableCell( 47 | polygon=table.polygon, 48 | text_lines=["54.4567<br>89x"], 49 | rowspan=1, 50 | colspan=1, 51 | row_id=0, 52 | col_id=0, 53 | is_header=False, 54 | page_id=page.page_id, 55 | ) 56 | page.add_full_block(cell) 57 | table.structure = [] 58 | table.add_structure(cell) 59 | 60 | renderer = MarkdownRenderer() 61 | md = renderer(pdf_document).markdown 62 | assert "54 .45 67<br>89 $x$" in md 63 | 64 | 65 |
-------------------------------------------------------------------------------- /tests/schema/groups/test_list_grouping.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from marker.builders.structure import StructureBuilder 4 | from marker.schema import BlockTypes 5 | 6 | 7 | @pytest.mark.config({"page_range": [4]}) 8 | def test_list_grouping(pdf_document): 9 | structure = StructureBuilder() 10 | structure(pdf_document) 11 | 12 | page = pdf_document.pages[0] 13 | list_groups = [] 14 | for block in page.children: 15 | if block.block_type == BlockTypes.ListGroup: 16 | list_groups.append(block) 17 | 18 | # The model breaks this up, since it has equations in it 19 | assert len(list_groups) == 3 20 |
-------------------------------------------------------------------------------- /tests/services/test_service_init.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from marker.converters.pdf import PdfConverter 4 | from marker.services.gemini import GoogleGeminiService 5 | from marker.services.ollama import OllamaService 6 | from marker.services.vertex import GoogleVertexService 7 | from marker.services.openai import OpenAIService 8 | 9 | 10 | @pytest.mark.output_format("markdown") 11 | @pytest.mark.config({"page_range": [0]}) 12 | def test_empty_llm(pdf_converter: PdfConverter, temp_doc): 13 | assert pdf_converter.artifact_dict["llm_service"] is None 14 | assert pdf_converter.llm_service is None 15 | 16 | 17 | def test_llm_no_keys(model_dict, config): 18 | with pytest.raises(AssertionError): 19 | PdfConverter( 20 | artifact_dict=model_dict, 21 | config={"use_llm": True} 22 | ) 23 | 24 | @pytest.mark.output_format("markdown") 25 | @pytest.mark.config({"page_range": [0], "use_llm": True, "gemini_api_key": "test"}) 26 | def test_llm_gemini(pdf_converter: PdfConverter, temp_doc): 27 | assert pdf_converter.artifact_dict["llm_service"] is not None 28 | assert isinstance(pdf_converter.llm_service, GoogleGeminiService) 29 | 30 | 31 | @pytest.mark.output_format("markdown") 32 | @pytest.mark.config({"page_range": [0], "use_llm": True, "vertex_project_id": "test", "llm_service": "marker.services.vertex.GoogleVertexService"}) 33 | def test_llm_vertex(pdf_converter: PdfConverter, temp_doc): 34 | assert pdf_converter.artifact_dict["llm_service"] is not None 35 | assert isinstance(pdf_converter.llm_service, GoogleVertexService) 36 | 37 | 38 | @pytest.mark.output_format("markdown") 39 | @pytest.mark.config({"page_range": [0], "use_llm": True, "llm_service": "marker.services.ollama.OllamaService"}) 40 | def test_llm_ollama(pdf_converter: PdfConverter, temp_doc): 41 | assert pdf_converter.artifact_dict["llm_service"] is not None 42 | assert isinstance(pdf_converter.llm_service, OllamaService) 43 | 44 | @pytest.mark.output_format("markdown") 45 | @pytest.mark.config({"page_range": [0], "use_llm": True, "llm_service": "marker.services.openai.OpenAIService", "openai_api_key": "test"}) 46 | def test_llm_openai(pdf_converter: PdfConverter, temp_doc): 47 | assert pdf_converter.artifact_dict["llm_service"] is not None 48 | assert isinstance(pdf_converter.llm_service, OpenAIService)
-------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | from marker.providers.pdf import PdfProvider 2 | import tempfile 3 | 4 | import datasets 5 | 6 | 7 | def
setup_pdf_provider( 8 | filename='adversarial.pdf', 9 | config=None, 10 | ) -> PdfProvider: 11 | dataset = datasets.load_dataset("datalab-to/pdfs", split="train") 12 | idx = dataset['filename'].index(filename) 13 | 14 | temp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf") 15 | temp_pdf.write(dataset['pdf'][idx]) 16 | temp_pdf.flush() 17 | 18 | provider = PdfProvider(temp_pdf.name, config) 19 | return provider 20 | --------------------------------------------------------------------------------
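Usage sketch: the tests above all construct converters the same way — a dict of model artifacts plus a plain config dict, with an optional LLM service selected by its import path (see tests/services/test_service_init.py and tests/config/test_config.py). The snippet below ties that flow together end to end. It is a minimal sketch rather than repository code: it assumes create_model_dict in marker.models is the artifact factory, and "document.pdf" is a placeholder input path.

# Minimal conversion sketch. Assumptions: marker.models.create_model_dict builds
# the artifact dict the converters expect, and "document.pdf" is a placeholder path.
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict

config = {
    "page_range": [0],  # convert only the first page, as the test fixtures do
    # Optional LLM post-processing through a local Ollama server; these keys
    # override the defaults declared on OllamaService in marker/services/ollama.py:
    # "use_llm": True,
    # "llm_service": "marker.services.ollama.OllamaService",
    # "ollama_base_url": "http://localhost:11434",
    # "ollama_model": "llama3.2-vision",
}

converter = PdfConverter(artifact_dict=create_model_dict(), config=config)
rendered = converter("document.pdf")  # markdown output by default, with .markdown, .metadata, .images
print(rendered.markdown)

The CLI entry points registered in pyproject.toml (marker, marker_single, marker_server) wrap this same converter flow, and tests/config/test_config.py shows how config keys like the ones above surface as command-line flags.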