├── .dockerignore
├── .github
    ├── CONTRIBUTING.md
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.yml
    │   ├── documentation.yml
    │   └── feature_request.yml
    ├── actions
    │   └── setup-venv
    │   │   └── action.yml
    ├── dependabot.yml
    ├── pull_request_template.md
    └── workflows
    │   ├── main.yml
    │   └── pr_checks.yml
├── .gitignore
├── .readthedocs.yaml
├── CHANGELOG.md
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── RELEASE_PROCESS.md
├── docs
    ├── .gitignore
    ├── Makefile
    ├── make.bat
    └── source
    │   ├── CHANGELOG.md
    │   ├── CONTRIBUTING.md
    │   ├── _static
    │       ├── css
    │       │   └── custom.css
    │       └── favicon.ico
    │   ├── conf.py
    │   ├── index.md
    │   ├── installation.md
    │   ├── ocr_pareto.pdf
    │   ├── ocr_pareto.png
    │   └── overview.md
├── gantry-requirements.txt
├── olmocr
    ├── __init__.py
    ├── bench
    │   ├── README.md
    │   ├── __init__.py
    │   ├── benchmark.py
    │   ├── convert.py
    │   ├── katex
    │   │   ├── __init__.py
    │   │   ├── auto-render.min.js
    │   │   ├── katex.min.css
    │   │   ├── katex.min.js
    │   │   └── render.py
    │   ├── miners
    │   │   ├── check_headers_footers.py
    │   │   ├── check_multicolumn.py
    │   │   ├── check_old_scans_math.py
    │   │   ├── cleanup_data.py
    │   │   ├── cleanup_urls.py
    │   │   ├── delete_rejected.py
    │   │   ├── download_math.py
    │   │   ├── mine_diffs.py
    │   │   ├── mine_headers_footers.py
    │   │   ├── mine_long_tiny_text.py
    │   │   ├── mine_math.py
    │   │   ├── mine_multi_column.py
    │   │   ├── mine_old_scan_pdf.py
    │   │   ├── mine_old_scans.py
    │   │   ├── mine_old_scans_math.py
    │   │   ├── mine_reading_order.py
    │   │   ├── mine_tables_gemini.py
    │   │   ├── mine_tables_gpt.py
    │   │   └── pick_mediod.py
    │   ├── prompts.py
    │   ├── report.py
    │   ├── review_app.py
    │   ├── review_app_latex.py
    │   ├── runners
    │   │   ├── __init__.py
    │   │   ├── run_chatgpt.py
    │   │   ├── run_claude.py
    │   │   ├── run_docling.py
    │   │   ├── run_gemini.py
    │   │   ├── run_gotocr.py
    │   │   ├── run_marker.py
    │   │   ├── run_mineru.py
    │   │   ├── run_mistral.py
    │   │   ├── run_olmocr_pipeline.py
    │   │   ├── run_rolmocr.py
    │   │   ├── run_server.py
    │   │   └── run_transformers.py
    │   ├── sample_data
    │   │   ├── dataset.jsonl
    │   │   ├── olmocr_pipeline
    │   │   │   ├── buildingnotes_pg1_repeat1.md
    │   │   │   ├── discoverworld_crazy_table4_pg1_repeat1.md
    │   │   │   ├── earnings_pg1_repeat1.md
    │   │   │   ├── headers_footers
    │   │   │   │   ├── ff0f0b22c55d8b90dd77d153f48e144fc9db_pg2_pg1_repeat1.md
    │   │   │   │   ├── ff1fc6a205ad039139ce566851b6b260c929_pg1_pg1_repeat1.md
    │   │   │   │   ├── ff3d6e051903fe5ca9bc172ece14964c5632_pg1_pg1_repeat1.md
    │   │   │   │   ├── ff4f7dad78081cff727d19ab51c181d4a661_pg1_pg1_repeat1.md
    │   │   │   │   ├── ff518b1240a66978f22035528ccb029450b5_pg2_pg1_repeat1.md
    │   │   │   │   ├── ffaac214730d2b8c2ec842e3618ccb9c4259_pg1_pg1_repeat1.md
    │   │   │   │   └── fff590bed29a2854ac1f874dad5752ede1aa_pg1_pg1_repeat1.md
    │   │   │   ├── lincoln_letter_pg1_repeat1.md
    │   │   │   ├── math_2503_04086_pg1_repeat1.md
    │   │   │   ├── mathfuncs_colswitch_pg1_repeat1.md
    │   │   │   ├── mathfuncs_pg1_repeat1.md
    │   │   │   ├── mattsnotes_pg1_repeat1.md
    │   │   │   ├── mattsnotes_pg2_repeat1.md
    │   │   │   ├── mattsnotes_pg3_repeat1.md
    │   │   │   ├── multi_column_miss_pg1_repeat1.md
    │   │   │   ├── olmo2-pg4_pg1_repeat1.md
    │   │   │   ├── openstax_caculus_pg_273_pg1_repeat1.md
    │   │   │   ├── small_page_size_pg1_repeat1.md
    │   │   │   └── test-graphical-text_pg1_repeat1.md
    │   │   └── pdfs
    │   │   │   ├── buildingnotes.pdf
    │   │   │   ├── discoverworld_crazy_table4.pdf
    │   │   │   ├── earnings.pdf
    │   │   │   ├── headers_footers
    │   │   │       ├── ff0f0b22c55d8b90dd77d153f48e144fc9db_pg2.pdf
    │   │   │       ├── ff1fc6a205ad039139ce566851b6b260c929_pg1.pdf
    │   │   │       ├── ff3d6e051903fe5ca9bc172ece14964c5632_pg1.pdf
    │   │   │       ├── ff4f7dad78081cff727d19ab51c181d4a661_pg1.pdf
    │   │   │       ├── ff518b1240a66978f22035528ccb029450b5_pg2.pdf
    │   │   │       ├── ffaac214730d2b8c2ec842e3618ccb9c4259_pg1.pdf
    │   │   │       └── fff590bed29a2854ac1f874dad5752ede1aa_pg1.pdf
    │   │   │   ├── lincoln_letter.pdf
    │   │   │   ├── math_2503_04086.pdf
    │   │   │   ├── mathfuncs.pdf
    │   │   │   ├── mathfuncs_colswitch.pdf
    │   │   │   ├── mattsnotes.pdf
    │   │   │   ├── multi_column_miss.pdf
    │   │   │   ├── olmo2-pg4.pdf
    │   │   │   ├── openstax_caculus_pg_273.pdf
    │   │   │   ├── small_page_size.pdf
    │   │   │   └── test-graphical-text.pdf
    │   ├── scripts
    │   │   ├── convert_all.sh
    │   │   ├── difference_viewer.py
    │   │   ├── run_difference.py
    │   │   ├── url_matcher.py
    │   │   └── workspace_to_bench.py
    │   ├── synth
    │   │   ├── __init__.py
    │   │   ├── mine_html_templates.py
    │   │   └── test_mine.py
    │   ├── templates
    │   │   ├── all_done.html
    │   │   ├── all_done_latex.html
    │   │   ├── review.html
    │   │   └── review_latex.html
    │   ├── tests.py
    │   └── utils.py
    ├── check.py
    ├── data
    │   ├── __init__.py
    │   ├── buildsilver.py
    │   ├── buildsilverdatasummary.py
    │   ├── buildtestset.py
    │   ├── convertsilver_birr.py
    │   ├── convertsilver_openai.py
    │   ├── renderpdf.py
    │   └── runopenaibatch.py
    ├── datatypes.py
    ├── eval
    │   ├── __init__.py
    │   ├── buildelo.py
    │   ├── dolma_refine
    │   │   ├── aligners.py
    │   │   ├── metrics.py
    │   │   ├── registry.py
    │   │   └── segmenters.py
    │   ├── evalhtml.py
    │   ├── evalhtml_template.html
    │   ├── runeval.py
    │   └── scoreelo.py
    ├── filter
    │   ├── __init__.py
    │   ├── coherency.py
    │   └── filter.py
    ├── image_utils.py
    ├── loadertest.py
    ├── metrics.py
    ├── pipeline.py
    ├── prompts
    │   ├── __init__.py
    │   ├── anchor.py
    │   └── prompts.py
    ├── py.typed
    ├── repeatdetect.py
    ├── s3_utils.py
    ├── train
    │   ├── __init__.py
    │   ├── config
    │   │   ├── molmo-o-lora-8192.yaml
    │   │   ├── molmo-o-lora.yaml
    │   │   ├── qwen25vl-7b.yaml
    │   │   ├── qwen2vl-2b-lora.yaml
    │   │   ├── qwen2vl-2b.yaml
    │   │   ├── qwen2vl-7b-lora.yaml
    │   │   └── qwen2vl-7b.yaml
    │   ├── core
    │   │   ├── __init__.py
    │   │   ├── adapters.py
    │   │   ├── cli.py
    │   │   ├── compression.py
    │   │   ├── config.py
    │   │   ├── errors.py
    │   │   ├── loggers.py
    │   │   ├── paths.py
    │   │   └── state.py
    │   ├── dataloader.py
    │   ├── dataprep.py
    │   ├── fixqwen25vlcheckpoint.py
    │   ├── hf
    │   │   ├── __init__.py
    │   │   ├── convertjsontoparquet.py
    │   │   ├── hfhub_upload.py
    │   │   └── warc_parser.py
    │   ├── inference.py
    │   ├── loaddataset.py
    │   ├── molmo
    │   │   ├── __init__.py
    │   │   ├── config_molmo.py
    │   │   ├── image_processing_molmo.py
    │   │   ├── modeling_molmo.py
    │   │   └── preprocessing_molmo.py
    │   ├── train.py
    │   └── utils.py
    ├── version.py
    ├── viewer
    │   ├── __init__.py
    │   ├── dolmaviewer.py
    │   └── dolmaviewer_template.html
    └── work_queue.py
├── pyproject.toml
├── scripts
    ├── autoscan_dolmadocs.py
    ├── beaker
    │   ├── Dockerfile-gpu-ci
    │   ├── Dockerfile-inference
    │   ├── Dockerfile-tagging
    │   ├── Dockerfile-train
    │   ├── gpu-ci-script.sh
    │   ├── jupiter-ib.sh
    │   └── pluto-ib.sh
    ├── benchmark_throughput.py
    ├── birr
    │   └── config
    │   │   └── qwen2-vl-7b-pdf-weka.yaml
    ├── build-docker.sh
    ├── chatgpt_tag_dolmadocs_v1.py
    ├── chatgpt_tag_dolmadocs_v2.py
    ├── check_qual.sh
    ├── elo
    │   ├── README.md
    │   ├── boxplots.png
    │   ├── calculate_elo_ratings.py
    │   ├── draw_boxplots.py
    │   ├── ratings.csv
    │   └── results.txt
    ├── infinigram_count.py
    ├── jsonl_to_markdown.py
    ├── molmo-7b-lora-gantry.sh
    ├── movedolmadocs_to_md.py
    ├── pareto_plot.py
    ├── parse_with_pdfminer.py
    ├── pii_rule_comparison.py
    ├── prepare_changelog.py
    ├── qwen25vl-7b-gantry.sh
    ├── qwen2vl-2b-gantry.sh
    ├── qwen2vl-7b-gantry.sh
    ├── qwen2vl-7b-lora-gantry.sh
    ├── release.sh
    ├── release_notes.py
    ├── rich_tagging_pipeline.py
    ├── run_benchmark.sh
    ├── run_integration_test.sh
    ├── run_tagging_pipeline.sh
    ├── s2orc_extractor.sh
    ├── scan_dolmadocs.py
    ├── tagging_pipeline.py
    └── tagging_pipeline_v2.py
└── tests
    ├── __init__.py
    ├── gnarly_pdfs
        ├── ambiguous.pdf
        ├── badlines.pdf
        ├── bws_book_ch2.pdf
        ├── discoverworld_crazy_tables.pdf
        ├── dolma-page-1.pdf
        ├── edgar.pdf
        ├── failing_anchor_pg4.pdf
        ├── failing_pdf_pg9.pdf
        ├── form_on_later_pages.pdf
        ├── guidebook_failed_pages.pdf
        ├── handwriting_bad_ocr.pdf
        ├── horribleocr.pdf
        ├── instructions_and_schematics.pdf
        ├── large_prompt_hint1.pdf
        ├── large_prompt_hint2.pdf
        ├── large_prompt_hint3.pdf
        ├── load_v_error.pdf
        ├── lots_of_chem_tables.pdf
        ├── lots_of_sci_tables.pdf
        ├── map1.pdf
        ├── most_content_in_image_form.pdf
        ├── newspaper.pdf
        ├── not_parsing.pdf
        ├── not_parsing2.pdf
        ├── olmo-page-1.pdf
        ├── overrun_on_pg8.pdf
        ├── pdftotext_two_column_issue.pdf
        ├── repeating_references_on_pg9_pg10.pdf
        ├── skinnypage.pdf
        ├── slideshow_mostly_good_some_pages_should_get_filtered.pdf
        ├── slideshow_mostly_images.pdf
        ├── small_page_size.pdf
        ├── some_ocr1.pdf
        ├── ti89_guidebook_programming.pdf
        └── tobacco_missed_tokens_pg1.pdf
    ├── test_anchor.py
    ├── test_dataloader.py
    ├── test_dataprep.py
    ├── test_filter.py
    ├── test_integration.py
    ├── test_molmo.py
    ├── test_renders
        ├── output_image.png
        └── output_image_rotated90.png
    ├── test_s3_work_queue.py
    ├── test_sglang.py
    └── test_tests.py


/.dockerignore:
--------------------------------------------------------------------------------
1 | .git
2 | .github
3 | .mypy_cache
4 | .pytest_cache
5 | .venv
6 | __pycache__
7 | *.egg-info
8 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.yml:
--------------------------------------------------------------------------------
 1 | name: 🐛 Bug Report
 2 | description: Create a report to help us reproduce and fix the bug
 3 | labels: 'bug'
 4 | 
 5 | body:
 6 | - type: markdown
 7 |   attributes:
 8 |     value: >
 9 |       #### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/allenai/olmocr/issues?q=is%3Aissue+sort%3Acreated-desc+).
10 | - type: textarea
11 |   attributes:
12 |     label: 🐛 Describe the bug
13 |     description: |
14 |       Please provide a clear and concise description of what the bug is.
15 | 
16 |       If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example:
17 | 
18 |       ```python
19 |       # All necessary imports at the beginning
20 |       import olmocr
21 | 
22 |       # A succinct reproducing example trimmed down to the essential parts:
23 |       assert False is True, "Oh no!"
24 |       ```
25 | 
26 |       If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
27 | 
28 |       Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
29 |     placeholder: |
30 |       A clear and concise description of what the bug is.
31 |   validations:
32 |     required: true
33 | - type: textarea
34 |   attributes:
35 |     label: Versions
36 |     description: |
37 |       Please run the following and paste the output below.
38 |       ```sh
39 |       python --version && pip freeze
40 |       ```
41 |   validations:
42 |     required: true
43 | - type: markdown
44 |   attributes:
45 |     value: >
46 |       Thanks for contributing 🎉!
47 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/documentation.yml:
--------------------------------------------------------------------------------
 1 | name: 📚 Documentation
 2 | description: Report an issue related to https://olmocr.readthedocs.io/latest
 3 | labels: 'documentation'
 4 | 
 5 | body:
 6 | - type: textarea
 7 |   attributes:
 8 |     label: 📚 The doc issue
 9 |     description: >
10 |       A clear and concise description of what content in https://olmocr.readthedocs.io/latest is an issue.
11 |   validations:
12 |     required: true
13 | - type: textarea
14 |   attributes:
15 |     label: Suggest a potential alternative/fix
16 |     description: >
17 |       Tell us how we could improve the documentation in this regard.
18 | - type: markdown
19 |   attributes:
20 |     value: >
21 |       Thanks for contributing 🎉!
22 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.yml:
--------------------------------------------------------------------------------
 1 | name: 🚀 Feature request
 2 | description: Submit a proposal/request for a new feature
 3 | labels: 'feature request'
 4 | 
 5 | body:
 6 | - type: textarea
 7 |   attributes:
 8 |     label: 🚀 The feature, motivation and pitch
 9 |     description: >
10 |       A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too.
11 |   validations:
12 |     required: true
13 | - type: textarea
14 |   attributes:
15 |     label: Alternatives
16 |     description: >
17 |       A description of any alternative solutions or features you've considered, if any.
18 | - type: textarea
19 |   attributes:
20 |     label: Additional context
21 |     description: >
22 |       Add any other context or screenshots about the feature request.
23 | - type: markdown
24 |   attributes:
25 |     value: >
26 |       Thanks for contributing 🎉!
27 | 


--------------------------------------------------------------------------------
/.github/actions/setup-venv/action.yml:
--------------------------------------------------------------------------------
 1 | name: Python virtualenv
 2 | description: Set up a Python virtual environment with caching
 3 | inputs:
 4 |   python-version:
 5 |     description: The Python version to use
 6 |     required: true
 7 |   cache-prefix:
 8 |     description: Update this to invalidate the cache
 9 |     required: true
10 |     default: v0
11 | runs:
12 |   using: composite
13 |   steps:
14 |     - name: Setup Python
15 |       uses: actions/setup-python@v4
16 |       with:
17 |         python-version: ${{ inputs.python-version }}
18 | 
19 |     - shell: bash
20 |       run: |
21 |         # Install prerequisites.
22 |         pip install --upgrade pip setuptools wheel virtualenv
23 | 
24 |     - shell: bash
25 |       run: |
26 |         # Get the exact Python version to use in the cache key.
27 |         echo "PYTHON_VERSION=$(python --version)" >> $GITHUB_ENV
28 | 
29 |     - uses: actions/cache@v3
30 |       id: virtualenv-cache
31 |       with:
32 |         path: .venv
33 |         key: ${{ inputs.cache-prefix }}-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('pyproject.toml') }}
34 | 
35 |     - if: steps.virtualenv-cache.outputs.cache-hit != 'true'
36 |       shell: bash
37 |       run: |
38 |         # Set up virtual environment without cache hit.
39 |         test -d .venv || virtualenv -p $(which python) --copies --reset-app-data .venv
40 |         . .venv/bin/activate
41 |         pip install -e .[dev]
42 |         pip install -e .[bench]
43 | 
44 |     - if: steps.virtualenv-cache.outputs.cache-hit == 'true'
45 |       shell: bash
46 |       run: |
47 |         # Set up virtual environment from cache hit.
48 |         . .venv/bin/activate
49 |         pip install --no-deps -e .[dev]
50 |         pip install --no-deps -e .[bench]
51 | 
52 |     - shell: bash
53 |       run: |
54 |         # Show environment info.
55 |         . .venv/bin/activate
56 |         echo "✓ Installed $(python --version) virtual environment to $(which python)"
57 |         echo "Packages:"
58 |         pip freeze
59 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | updates:
 3 | - package-ecosystem: "pip"
 4 |   directory: "/"
 5 |   schedule:
 6 |     interval: "daily"
 7 |   open-pull-requests-limit: 10
 8 | - package-ecosystem: "github-actions"
 9 |   directory: "/"
10 |   schedule:
11 |     interval: "daily"
12 | 


--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
 1 | <!-- To ensure we can review your pull request promptly please complete this template entirely. -->
 2 | 
 3 | <!-- Please reference the issue number here. You can replace "Fixes" with "Closes" if it makes more sense. -->
 4 | Fixes #
 5 | 
 6 | Changes proposed in this pull request:
 7 | <!-- Please list all changes/additions here. -->
 8 | -
 9 | 
10 | ## Before submitting
11 | 
12 | <!-- Please complete this checklist BEFORE submitting your PR to speed along the review process. -->
13 | - [ ] I've read and followed all steps in the [Making a pull request](https://github.com/allenai/olmocr/blob/main/.github/CONTRIBUTING.md#making-a-pull-request)
14 |     section of the `CONTRIBUTING` docs.
15 | - [ ] I've updated or added any relevant docstrings following the syntax described in the
16 |     [Writing docstrings](https://github.com/allenai/olmocr/blob/main/.github/CONTRIBUTING.md#writing-docstrings) section of the `CONTRIBUTING` docs.
17 | - [ ] If this PR fixes a bug, I've added a test that will fail without my fix.
18 | - [ ] If this PR adds a new feature, I've added tests that sufficiently cover my new functionality.
19 | 


--------------------------------------------------------------------------------
/.github/workflows/pr_checks.yml:
--------------------------------------------------------------------------------
 1 | name: PR Checks
 2 | 
 3 | concurrency:
 4 |   group: ${{ github.workflow }}-${{ github.ref }}
 5 |   cancel-in-progress: true
 6 | 
 7 | on:
 8 |   pull_request:
 9 |     branches:
10 |       - main
11 |     paths:
12 |       - 'olmocr/**'
13 | 
14 | jobs:
15 |   changelog:
16 |     name: CHANGELOG
17 |     runs-on: ubuntu-latest
18 |     if: github.event_name == 'pull_request'
19 | 
20 |     steps:
21 |     - uses: actions/checkout@v3
22 |       with:
23 |         fetch-depth: 0
24 | 
25 |     - name: Check that CHANGELOG has been updated
26 |       run: |
27 |         # If this step fails, this means you haven't updated the CHANGELOG.md
28 |         # file with notes on your contribution.
29 |         git diff --name-only $(git merge-base origin/main HEAD) | grep '^CHANGELOG.md$' && echo "Thanks for helping keep our CHANGELOG up-to-date!"
30 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # ml stuff
 2 | wandb/
 3 | *histogram.png
 4 | *.json
 5 | dolma_previews/*
 6 | s2_previews/*
 7 | gnarly_previews/*
 8 | s2orc_previews/*
 9 | s2orc_previews_3200/*
10 | sample200_vllm/*
11 | sample200_sglang/*
12 | pdelfin_testset/*
13 | localworkspace/*
14 | math_data/*
15 | math_data_big/*
16 | gpt4otestset/*
17 | gpt4otestset_output/*
18 | pdfs/*
19 | olmOCR-bench/*
20 | table_data*/
21 | /synth*/
22 | dolma_samples/*
23 | /*.html
24 | scoreelo.csv
25 | debug.log
26 | birrpipeline-debug.log
27 | beakerpipeline-debug.log
28 | olmocr-pipeline-debug.log
29 | 
30 | # build artifacts
31 | 
32 | .eggs/
33 | .mypy_cache
34 | *.egg-info/
35 | build/
36 | dist/
37 | pip-wheel-metadata/
38 | 
39 | 
40 | # dev tools
41 | 
42 | .envrc
43 | .python-version
44 | .idea
45 | .venv/
46 | .vscode/
47 | /*.iml
48 | pyrightconfig.json
49 | 
50 | 
51 | # jupyter notebooks
52 | 
53 | .ipynb_checkpoints
54 | 
55 | 
56 | # miscellaneous
57 | 
58 | .cache/
59 | doc/_build/
60 | *.swp
61 | .DS_Store
62 | 
63 | 
64 | # python
65 | 
66 | *.pyc
67 | *.pyo
68 | __pycache__
69 | 
70 | 
71 | # testing and continuous integration
72 | 
73 | .coverage
74 | .pytest_cache/
75 | .benchmarks
76 | 
77 | # documentation build artifacts
78 | 
79 | docs/build
80 | site/
81 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | sphinx:
 4 |   configuration: docs/source/conf.py
 5 |   fail_on_warning: true
 6 | 
 7 | python:
 8 |   version: "3.8"
 9 |   install:
10 |     - method: pip
11 |       path: .
12 |       extra_requirements:
13 |         - dev
14 | 
15 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | All notable changes to this project will be documented in this file.
 4 | 
 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 7 | 
 8 | ## Unreleased
 9 | 
10 | ## [v0.1.71](https://github.com/allenai/olmocr/releases/tag/v0.1.71) - 2025-05-30
11 | 
12 | ## [v0.1.70](https://github.com/allenai/olmocr/releases/tag/v0.1.70) - 2025-05-23
13 | 
14 | ## [v0.1.69](https://github.com/allenai/olmocr/releases/tag/v0.1.69) - 2025-05-20
15 | 
16 | ## [v0.1.68](https://github.com/allenai/olmocr/releases/tag/v0.1.68) - 2025-05-19
17 | 
18 | ## [v0.1.60](https://github.com/allenai/olmocr/releases/tag/v0.1.60) - 2025-03-17
19 | 
20 | ## [v0.1.58](https://github.com/allenai/olmocr/releases/tag/v0.1.58) - 2025-02-15
21 | 
22 | ## [v0.1.53](https://github.com/allenai/olmocr/releases/tag/v0.1.53) - 2025-02-14
23 | 
24 | - Fixed git checks
25 | 
26 | - Added gemini and claude runners and a viewer.


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM --platform=linux/amd64 nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04
 2 | 
 3 | RUN apt-get update -y && apt-get install -y software-properties-common \
 4 |     && add-apt-repository ppa:deadsnakes/ppa \
 5 |     && apt-get -y update
 6 | 
 7 | RUN apt-get update && apt-get -y install python3-apt
 8 | RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections
 9 | RUN apt-get update -y && apt-get install -y poppler-utils ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools
10 | 
11 | RUN apt-get update -y && apt-get install -y --no-install-recommends \
12 |     git \
13 |     git-lfs \
14 |     python3.11 \
15 |     python3.11-dev \
16 |     python3.11-distutils \
17 |     ca-certificates \
18 |     build-essential \
19 |     curl \
20 |     wget \
21 |     unzip
22 | 
23 | RUN rm -rf /var/lib/apt/lists/* \
24 |     && unlink /usr/bin/python3 \
25 |     && ln -s /usr/bin/python3.11 /usr/bin/python3 \
26 |     && ln -s /usr/bin/python3 /usr/bin/python \
27 |     && curl -sS https://bootstrap.pypa.io/get-pip.py | python \
28 |     && pip3 install -U pip   
29 | 
30 | RUN apt-get update && apt-get -y install python3.11-venv 
31 | ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
32 | RUN /install.sh && rm /install.sh
33 | 
34 | ENV PYTHONUNBUFFERED=1
35 | 
36 | WORKDIR /root
37 | COPY pyproject.toml pyproject.toml
38 | COPY olmocr/version.py olmocr/version.py
39 | 
40 | RUN /root/.local/bin/uv pip install --system --no-cache -e .
41 | RUN /root/.local/bin/uv pip install --system --no-cache ".[gpu]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/
42 | RUN /root/.local/bin/uv pip install --system --no-cache ".[bench]"
43 | RUN playwright install-deps
44 | RUN playwright install chromium
45 | COPY olmocr olmocr
46 | COPY scripts scripts
47 | 
48 | RUN python3 -m sglang.launch_server --help
49 | RUN python3 -m olmocr.pipeline --help


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY : docs
 2 | docs :
 3 | 	rm -rf docs/build/
 4 | 	sphinx-autobuild -b html --watch olmocr/ docs/source/ docs/build/
 5 | 
 6 | .PHONY : run-checks
 7 | run-checks :
 8 | 	isort --check .
 9 | 	black --check .
10 | 	ruff check .
11 | 	mypy .
12 | 	CUDA_VISIBLE_DEVICES='' pytest -v --color=yes --doctest-modules tests/ olmocr/
13 | 
14 | .PHONY : build
15 | build :
16 | 	rm -rf *.egg-info/
17 | 	python -m build
18 | 


--------------------------------------------------------------------------------
/RELEASE_PROCESS.md:
--------------------------------------------------------------------------------
 1 | # GitHub Release Process
 2 | 
 3 | ## Steps
 4 | 
 5 | 1. Update the version in `olmocr/version.py`.
 6 | 
 7 | 3. Run the release script:
 8 | 
 9 |     ```bash
10 |     ./scripts/release.sh
11 |     ```
12 | 
13 |     This will commit the changes to the CHANGELOG and `version.py` files and then create a new tag in git
14 |     which will trigger a workflow on GitHub Actions that handles the rest.
15 | 
16 | ## Fixing a failed release
17 | 
18 | If for some reason the GitHub Actions release workflow failed with an error that needs to be fixed, you'll have to delete both the tag and corresponding release from GitHub. After you've pushed a fix, delete the tag from your local clone with
19 | 
20 | ```bash
21 | git tag -l | xargs git tag -d && git fetch -t
22 | ```
23 | 
24 | Then repeat the steps above.
25 | 


--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?= -W
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.https://www.sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/source/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | ../../CHANGELOG.md


--------------------------------------------------------------------------------
/docs/source/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | ../../.github/CONTRIBUTING.md


--------------------------------------------------------------------------------
/docs/source/_static/css/custom.css:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/docs/source/_static/css/custom.css


--------------------------------------------------------------------------------
/docs/source/_static/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/docs/source/_static/favicon.ico


--------------------------------------------------------------------------------
/docs/source/index.md:
--------------------------------------------------------------------------------
 1 | # **olmocr**
 2 | 
 3 | ```{toctree}
 4 | :maxdepth: 2
 5 | :hidden:
 6 | :caption: Getting started
 7 | 
 8 | installation
 9 | overview
10 | ```
11 | 
12 | ```{toctree}
13 | :hidden:
14 | :caption: Development
15 | 
16 | CHANGELOG
17 | CONTRIBUTING
18 | License <https://raw.githubusercontent.com/allenai/olmocr/main/LICENSE>
19 | GitHub Repository <https://github.com/allenai/olmocr>
20 | ```
21 | 
22 | ## Indices and tables
23 | 
24 | ```{eval-rst}
25 | * :ref:`genindex`
26 | * :ref:`modindex`
27 | ```
28 | 


--------------------------------------------------------------------------------
/docs/source/installation.md:
--------------------------------------------------------------------------------
 1 | Installation
 2 | ============
 3 | 
 4 | **olmocr** supports Python >= 3.8.
 5 | 
 6 | ## Installing with `pip`
 7 | 
 8 | **olmocr** is available [on PyPI](https://pypi.org/project/olmocr/). Just run
 9 | 
10 | ```bash
11 | pip install olmocr
12 | ```
13 | 
14 | ## Installing from source
15 | 
16 | To install **olmocr** from source, first clone [the repository](https://github.com/allenai/olmocr):
17 | 
18 | ```bash
19 | git clone https://github.com/allenai/olmocr.git
20 | cd olmocr
21 | ```
22 | 
23 | Then run
24 | 
25 | ```bash
26 | pip install -e .
27 | ```
28 | 


--------------------------------------------------------------------------------
/docs/source/ocr_pareto.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/docs/source/ocr_pareto.pdf


--------------------------------------------------------------------------------
/docs/source/ocr_pareto.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/docs/source/ocr_pareto.png


--------------------------------------------------------------------------------
/docs/source/overview.md:
--------------------------------------------------------------------------------
1 | Overview
2 | ========
3 | 
4 | 


--------------------------------------------------------------------------------
/gantry-requirements.txt:
--------------------------------------------------------------------------------
 1 | torchvision
 2 | cached-path
 3 | smart_open
 4 | pypdf
 5 | pypdfium2
 6 | lingua-language-detector
 7 | Pillow
 8 | ruff
 9 | mypy>=1.0,<1.5
10 | black>=23.0,<24.0
11 | isort>=5.12,<5.13
12 | pytest
13 | pytest-sphinx
14 | pytest-cov
15 | twine>=1.11.0
16 | build
17 | setuptools
18 | wheel
19 | Sphinx>=4.3.0,<7.1.0
20 | furo==2023.7.26
21 | myst-parser>=1.0,<2.1
22 | sphinx-copybutton==0.5.2
23 | sphinx-autobuild==2021.3.14
24 | sphinx-autodoc-typehints==1.23.3
25 | packaging
26 | necessary
27 | accelerate>=0.34.2
28 | datasets==3.0.0
29 | peft
30 | wandb
31 | omegaconf
32 | s3fs
33 | transformers>=4.45.1
34 | bitsandbytes
35 | ftfy
36 | 


--------------------------------------------------------------------------------
/olmocr/__init__.py:
--------------------------------------------------------------------------------
1 | from .version import VERSION, VERSION_SHORT
2 | 


--------------------------------------------------------------------------------
/olmocr/bench/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/__init__.py


--------------------------------------------------------------------------------
/olmocr/bench/katex/__init__.py:
--------------------------------------------------------------------------------
1 | from .render import compare_rendered_equations, render_equation
2 | 


--------------------------------------------------------------------------------
/olmocr/bench/katex/auto-render.min.js:
--------------------------------------------------------------------------------
1 | !function(e,t){"object"==typeof exports&&"object"==typeof module?module.exports=t(require("katex")):"function"==typeof define&&define.amd?define(["katex"],t):"object"==typeof exports?exports.renderMathInElement=t(require("katex")):e.renderMathInElement=t(e.katex)}("undefined"!=typeof self?self:this,(function(e){return function(){"use strict";var t={757:function(t){t.exports=e}},n={};function r(e){var o=n[e];if(void 0!==o)return o.exports;var i=n[e]={exports:{}};return t[e](i,i.exports,r),i.exports}r.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return r.d(t,{a:t}),t},r.d=function(e,t){for(var n in t)r.o(t,n)&&!r.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},r.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)};var o={};r.d(o,{default:function(){return p}});var i=r(757),a=r.n(i);const l=function(e,t,n){let r=n,o=0;const i=e.length;for(;r<t.length;){const n=t[r];if(o<=0&&t.slice(r,r+i)===e)return r;"\\"===n?r++:"{"===n?o++:"}"===n&&o--,r++}return-1},s=/^\\begin{/;var d=function(e,t){let n;const r=[],o=new RegExp("("+t.map((e=>e.left.replace(/[-/\\^$*+?.()|[\]{}]/g,"\\$&"))).join("|")+")");for(;n=e.search(o),-1!==n;){n>0&&(r.push({type:"text",data:e.slice(0,n)}),e=e.slice(n));const o=t.findIndex((t=>e.startsWith(t.left)));if(n=l(t[o].right,e,t[o].left.length),-1===n)break;const i=e.slice(0,n+t[o].right.length),a=s.test(i)?i:e.slice(t[o].left.length,n);r.push({type:"math",data:a,rawData:i,display:t[o].display}),e=e.slice(n+t[o].right.length)}return""!==e&&r.push({type:"text",data:e}),r};const c=function(e,t){const n=d(e,t.delimiters);if(1===n.length&&"text"===n[0].type)return null;const r=document.createDocumentFragment();for(let e=0;e<n.length;e++)if("text"===n[e].type)r.appendChild(document.createTextNode(n[e].data));else{const o=document.createElement("span");let i=n[e].data;t.displayMode=n[e].display;try{t.preProcess&&(i=t.preProcess(i)),a().render(i,o,t)}catch(o){if(!(o instanceof a().ParseError))throw o;t.errorCallback("KaTeX auto-render: Failed to parse `"+n[e].data+"` with ",o),r.appendChild(document.createTextNode(n[e].rawData));continue}r.appendChild(o)}return r},f=function(e,t){for(let n=0;n<e.childNodes.length;n++){const r=e.childNodes[n];if(3===r.nodeType){let o=r.textContent,i=r.nextSibling,a=0;for(;i&&i.nodeType===Node.TEXT_NODE;)o+=i.textContent,i=i.nextSibling,a++;const l=c(o,t);if(l){for(let e=0;e<a;e++)r.nextSibling.remove();n+=l.childNodes.length-1,e.replaceChild(l,r)}else n+=a}else if(1===r.nodeType){const e=" "+r.className+" ";-1===t.ignoredTags.indexOf(r.nodeName.toLowerCase())&&t.ignoredClasses.every((t=>-1===e.indexOf(" "+t+" ")))&&f(r,t)}}};var p=function(e,t){if(!e)throw new Error("No element provided to render");const n={};for(const e in t)t.hasOwnProperty(e)&&(n[e]=t[e]);n.delimiters=n.delimiters||[{left:"$$",right:"$$",display:!0},{left:"\\(",right:"\\)",display:!1},{left:"\\begin{equation}",right:"\\end{equation}",display:!0},{left:"\\begin{align}",right:"\\end{align}",display:!0},{left:"\\begin{alignat}",right:"\\end{alignat}",display:!0},{left:"\\begin{gather}",right:"\\end{gather}",display:!0},{left:"\\begin{CD}",right:"\\end{CD}",display:!0},{left:"\\[",right:"\\]",display:!0}],n.ignoredTags=n.ignoredTags||["script","noscript","style","textarea","pre","code","option"],n.ignoredClasses=n.ignoredClasses||[],n.errorCallback=n.errorCallback||console.error,n.macros=n.macros||{},f(e,n)};return o=o.default}()}));


--------------------------------------------------------------------------------
/olmocr/bench/miners/cleanup_urls.py:
--------------------------------------------------------------------------------
 1 | # Rewrites all URLs in a dataset.jsonl file using a sql lite database lookup
 2 | import argparse
 3 | import json
 4 | import re
 5 | import sqlite3
 6 | from typing import Optional
 7 | 
 8 | 
 9 | def parse_pdf_hash(pretty_pdf_path: str) -> Optional[str]:
10 |     pattern = r"s3://ai2-s2-pdfs/([a-f0-9]{4})/([a-f0-9]+)\.pdf"
11 |     match = re.match(pattern, pretty_pdf_path)
12 |     if match:
13 |         return match.group(1) + match.group(2)
14 |     return None
15 | 
16 | 
17 | def get_uri_from_db(db_path: str, pdf_hash: str) -> Optional[str]:
18 |     conn = sqlite3.connect(db_path)
19 |     cursor = conn.cursor()
20 |     cursor.execute("SELECT uri FROM pdf_mapping WHERE pdf_hash = ?", (pdf_hash,))
21 |     result = cursor.fetchone()
22 |     conn.close()
23 |     return result[0] if result else None
24 | 
25 | 
26 | if __name__ == "__main__":
27 |     parser = argparse.ArgumentParser(description="Rewrites all URLs in a dataset.jsonl file using a sql lite database lookup")
28 |     parser.add_argument("jsonl", type=str, help="JSONL file containing s3 paths")
29 |     parser.add_argument("--db", type=str, required=True, help="Path to sqlite database mapping internal s3 urls to external ones")
30 |     parser.add_argument("--force", action="store_true", help="Path to sqlite database mapping internal s3 urls to external ones")
31 |     args = parser.parse_args()
32 | 
33 |     data = []
34 |     skipped = 0
35 | 
36 |     with open(args.jsonl, "r") as inpf:
37 |         for row in inpf:
38 |             if len(row.strip()) > 0:
39 |                 j = json.loads(row)
40 | 
41 |                 assert j["url"]
42 |                 hash = parse_pdf_hash(j["url"])
43 |                 if hash:
44 |                     url = get_uri_from_db(args.db, hash)
45 | 
46 |                     if url:
47 |                         j["url"] = url
48 |                         data.append(j)
49 |                     else:
50 |                         skipped += 1
51 |                 else:
52 |                     data.append(j)
53 | 
54 |     print(data)
55 | 
56 |     print(f"{skipped} entries were skipped!")
57 | 
58 |     if not args.force:
59 |         print("Now run with --force to write data")
60 |         quit()
61 | 
62 |     with open(args.jsonl, "w") as inpf:
63 |         for row in data:
64 |             print(json.dumps(row), file=inpf)
65 | 


--------------------------------------------------------------------------------
/olmocr/bench/prompts.py:
--------------------------------------------------------------------------------
 1 | def build_basic_prompt() -> str:
 2 |     return "Please provide a natural, plain text representation of the document, formatted in Markdown. Skip any headers and footers. For ALL mathematical expressions, use LaTeX notation with \( and \) for inline equations and \[ and \] for display equations. Convert any tables into Markdown format."
 3 | 
 4 | 
 5 | def build_openai_silver_data_prompt_no_document_anchoring(_base_text: str) -> str:
 6 |     return (
 7 |         "Below is the image of one page of a PDF document. "
 8 |         "Just return the plain text representation of this document as if you were reading it naturally.\n"
 9 |         "Turn equations into a LaTeX representation, and tables into markdown format. Remove the headers and footers, but keep references and footnotes.\n"
10 |         "Read any natural handwriting.\n"
11 |         "This is likely one page out of several in the document, so be sure to preserve any sentences that come from the previous page, or continue onto the next page, exactly as they are.\n"
12 |         "If there is no text at all that you think you should read, you can output null.\n"
13 |         "Do not hallucinate."
14 |     )
15 | 
16 | 
17 | def claude_response_format_schema() -> dict:
18 |     return (
19 |         {
20 |             "name": "page_response",
21 |             "description": "Extracts text from pdf's.",
22 |             "input_schema": {
23 |                 "type": "object",
24 |                 "properties": {
25 |                     "primary_language": {
26 |                         "type": ["string", "null"],
27 |                         "description": "The primary language of the text using two-letter codes or null if there is no text at all that you think you should read.",
28 |                     },
29 |                     "is_rotation_valid": {
30 |                         "type": "boolean",
31 |                         "description": "Is this page oriented correctly for reading? Answer only considering the textual content, do not factor in the rotation of any charts, tables, drawings, or figures.",
32 |                     },
33 |                     "rotation_correction": {
34 |                         "type": "integer",
35 |                         "description": "Indicates the degree of clockwise rotation needed if the page is not oriented correctly.",
36 |                         "enum": [0, 90, 180, 270],
37 |                         "default": 0,
38 |                     },
39 |                     "is_table": {
40 |                         "type": "boolean",
41 |                         "description": "Indicates if the majority of the page content is in tabular format.",
42 |                     },
43 |                     "is_diagram": {
44 |                         "type": "boolean",
45 |                         "description": "Indicates if the majority of the page content is a visual diagram.",
46 |                     },
47 |                     "natural_text": {
48 |                         "type": ["string", "null"],
49 |                         "description": "The natural text content extracted from the page.",
50 |                     },
51 |                 },
52 |                 "required": [
53 |                     "primary_language",
54 |                     "is_rotation_valid",
55 |                     "rotation_correction",
56 |                     "is_table",
57 |                     "is_diagram",
58 |                     "natural_text",
59 |                 ],
60 |             },
61 |         },
62 |     )
63 | 


--------------------------------------------------------------------------------
/olmocr/bench/runners/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/runners/__init__.py


--------------------------------------------------------------------------------
/olmocr/bench/runners/run_chatgpt.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | from typing import Literal
 4 | 
 5 | from openai import OpenAI
 6 | 
 7 | from olmocr.bench.prompts import (
 8 |     build_basic_prompt,
 9 |     build_openai_silver_data_prompt_no_document_anchoring,
10 | )
11 | from olmocr.data.renderpdf import render_pdf_to_base64png
12 | from olmocr.prompts.anchor import get_anchor_text
13 | from olmocr.prompts.prompts import (
14 |     PageResponse,
15 |     build_finetuning_prompt,
16 |     build_openai_silver_data_prompt,
17 |     openai_response_format_schema,
18 | )
19 | 
20 | 
21 | def run_chatgpt(
22 |     pdf_path: str,
23 |     page_num: int = 1,
24 |     model: str = "gpt-4o-2024-08-06",
25 |     temperature: float = 0.1,
26 |     target_longest_image_dim: int = 2048,
27 |     prompt_template: Literal["full", "full_no_document_anchoring", "basic", "finetune"] = "finetune",
28 |     response_template: Literal["plain", "json"] = "json",
29 | ) -> str:
30 |     """
31 |     Convert page of a PDF file to markdown using the commercial openAI APIs.
32 | 
33 |     See run_server.py for running against an openai compatible server
34 | 
35 |     Args:
36 |         pdf_path (str): The local path to the PDF file.
37 | 
38 |     Returns:
39 |         str: The OCR result in markdown format.
40 |     """
41 |     # Convert the first page of the PDF to a base64-encoded PNG image.
42 |     image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)
43 |     anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
44 | 
45 |     if not os.getenv("OPENAI_API_KEY"):
46 |         raise SystemExit("You must specify an OPENAI_API_KEY")
47 | 
48 |     client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
49 | 
50 |     if prompt_template == "full":
51 |         prompt = build_openai_silver_data_prompt(anchor_text)
52 |     elif prompt_template == "full_no_document_anchoring":
53 |         prompt = build_openai_silver_data_prompt_no_document_anchoring(anchor_text)
54 |     elif prompt_template == "finetune":
55 |         prompt = build_finetuning_prompt(anchor_text)
56 |     elif prompt_template == "basic":
57 |         prompt = build_basic_prompt()
58 |     else:
59 |         raise ValueError("Unknown prompt template")
60 | 
61 |     response = client.chat.completions.create(
62 |         model=model,
63 |         messages=[
64 |             {
65 |                 "role": "user",
66 |                 "content": [
67 |                     {"type": "text", "text": prompt},
68 |                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
69 |                 ],
70 |             }
71 |         ],
72 |         temperature=temperature,
73 |         max_tokens=3000,
74 |         response_format=openai_response_format_schema() if response_template == "json" else None,
75 |     )
76 | 
77 |     raw_response = response.choices[0].message.content
78 | 
79 |     assert len(response.choices) > 0
80 |     assert response.choices[0].message.refusal is None
81 |     assert response.choices[0].finish_reason == "stop"
82 | 
83 |     if response_template == "json":
84 |         data = json.loads(raw_response)
85 |         data = PageResponse(**data)
86 | 
87 |         return data.natural_text
88 |     else:
89 |         return raw_response
90 | 


--------------------------------------------------------------------------------
/olmocr/bench/runners/run_claude.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | 
 4 | from anthropic import Anthropic
 5 | from prompts import build_openai_silver_data_prompt, claude_response_format_schema
 6 | 
 7 | from olmocr.data.renderpdf import render_pdf_to_base64png
 8 | from olmocr.prompts.anchor import get_anchor_text
 9 | 
10 | 
11 | def run_claude(pdf_path: str, page_num: int = 1, model: str = "claude-3-7-sonnet-20250219", temperature: float = 0.1) -> str:
12 |     """
13 |     Convert page of a PDF file to markdown using Claude OCR.
14 |     This function renders the specified page of the PDF to an image, runs OCR on that image,
15 |     and returns the OCR result as a markdown-formatted string.
16 | 
17 |     Args:
18 |         pdf_path (str): The local path to the PDF file.
19 |         page_num (int): The page number to process (starting from 1).
20 |         model (str): The Claude model to use.
21 |         temperature (float): The temperature parameter for generation.
22 | 
23 |     Returns:
24 |         str: The OCR result in markdown format.
25 |     """
26 | 
27 |     if not os.getenv("ANTHROPIC_API_KEY"):
28 |         raise SystemExit("You must specify an ANTHROPIC_API_KEY")
29 | 
30 |     image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
31 |     anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
32 |     client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
33 |     response = client.messages.create(
34 |         model=model,
35 |         max_tokens=3000,
36 |         temperature=temperature,
37 |         # system=system_prompt,
38 |         tools=claude_response_format_schema(),
39 |         messages=[
40 |             {
41 |                 "role": "user",
42 |                 "content": [
43 |                     {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
44 |                     {
45 |                         "type": "text",
46 |                         "text": f"{build_openai_silver_data_prompt(anchor_text)}. Use the page_response tool to respond. If the propeties are true, then extract the text from them and respond in natural_text.",
47 |                     },
48 |                 ],
49 |             }
50 |         ],
51 |     )
52 | 
53 |     json_sentiment = None
54 |     for content in response.content:
55 |         if content.type == "tool_use" and content.name == "page_response":
56 |             json_sentiment = content.input
57 |             break
58 | 
59 |     if json_sentiment:
60 |         response = json.dumps(json_sentiment, indent=2)
61 |         return response
62 | 


--------------------------------------------------------------------------------
/olmocr/bench/runners/run_docling.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import os
 3 | import tempfile
 4 | from typing import Literal
 5 | 
 6 | from pypdf import PdfReader, PdfWriter
 7 | 
 8 | 
 9 | async def run_docling(
10 |     pdf_path: str,
11 |     page_num: int = 1,
12 |     output_format: Literal["markdown"] = "markdown",
13 |     use_smoldocling: bool = False,
14 | ) -> str:
15 |     """Run docling CLI on a PDF file and return the results.
16 | 
17 |     Args:
18 |         pdf_path: Path to the PDF file
19 |         page_num: Page number to process (1-indexed)
20 |         output_format: Output format (only markdown is supported for CLI version)
21 | 
22 |     Returns:
23 |         String containing the markdown output
24 |     """
25 |     if output_format != "markdown":
26 |         raise ValueError("Only markdown output format is supported for CLI version")
27 | 
28 |     # Extract the specific page using pypdf
29 |     pdf_reader = PdfReader(pdf_path)
30 |     pdf_writer = PdfWriter()
31 | 
32 |     # Convert from 1-indexed to 0-indexed
33 |     zero_based_page_num = page_num - 1
34 | 
35 |     if zero_based_page_num >= len(pdf_reader.pages) or zero_based_page_num < 0:
36 |         raise ValueError(f"Page number {page_num} is out of bounds for PDF with {len(pdf_reader.pages)} pages")
37 | 
38 |     # Add the selected page to the writer
39 |     pdf_writer.add_page(pdf_reader.pages[zero_based_page_num])
40 | 
41 |     # Create temporary files for the single-page PDF and output markdown
42 |     with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf_file, tempfile.NamedTemporaryFile(suffix=".md", delete=False) as tmp_md_file:
43 |         tmp_pdf_path = tmp_pdf_file.name
44 |         tmp_md_path = tmp_md_file.name
45 | 
46 |     try:
47 |         # Write the single-page PDF to the temporary file
48 |         with open(tmp_pdf_path, "wb") as f:
49 |             pdf_writer.write(f)
50 | 
51 |         # Build the command to run docling on the single-page PDF
52 |         if use_smoldocling:
53 |             cmd = ["docling", tmp_pdf_path, "-o", tmp_md_path]  # Output file
54 |         else:
55 |             cmd = ["docling", "--pipeline", "vlm", "--vlm-model", "smoldocling", tmp_pdf_path, "-o", tmp_md_path]  # Output file
56 | 
57 |         # Run the command asynchronously
58 |         proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
59 | 
60 |         stdout, stderr = await proc.communicate()
61 | 
62 |         if proc.returncode != 0:
63 |             error_msg = stderr.decode() if stderr else "Unknown error"
64 |             raise RuntimeError(f"docling command failed with return code {proc.returncode}: {error_msg}")
65 | 
66 |         # Read the results from the temporary markdown file
67 |         with open(tmp_md_path, "r", encoding="utf-8") as f:
68 |             result = f.read()
69 | 
70 |         return result
71 | 
72 |     finally:
73 |         # Clean up the temporary files
74 |         for path in [tmp_pdf_path, tmp_md_path]:
75 |             if os.path.exists(path):
76 |                 os.unlink(path)
77 | 


--------------------------------------------------------------------------------
/olmocr/bench/runners/run_gotocr.py:
--------------------------------------------------------------------------------
 1 | import base64
 2 | import os
 3 | import tempfile
 4 | 
 5 | import torch
 6 | from transformers import AutoModel, AutoTokenizer
 7 | 
 8 | from olmocr.data.renderpdf import render_pdf_to_base64png
 9 | 
10 | # Global cache for the model and tokenizer.
11 | _device = "cuda" if torch.cuda.is_available() else "cpu"
12 | _model = None
13 | _tokenizer = None
14 | 
15 | 
16 | def load_model():
17 |     """
18 |     Load the GOT-OCR model and tokenizer if they haven't been loaded already.
19 |     Returns:
20 |         model: The GOT-OCR model loaded on the appropriate device.
21 |         tokenizer: The corresponding tokenizer.
22 |     """
23 |     global _model, _tokenizer
24 |     if _model is None or _tokenizer is None:
25 |         _tokenizer = AutoTokenizer.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True)
26 |         _model = AutoModel.from_pretrained(
27 |             "ucaslcl/GOT-OCR2_0",
28 |             trust_remote_code=True,
29 |             use_safetensors=True,
30 |             revision="979938bf89ccdc949c0131ddd3841e24578a4742",
31 |             pad_token_id=_tokenizer.eos_token_id,
32 |         )
33 |         _model = _model.eval().to(_device)
34 |     return _model, _tokenizer
35 | 
36 | 
37 | def run_gotocr(pdf_path: str, page_num: int = 1, ocr_type: str = "ocr") -> str:
38 |     """
39 |     Convert page of a PDF file to markdown using GOT-OCR.
40 | 
41 |     This function renders the first page of the PDF to an image, runs OCR on that image,
42 |     and returns the OCR result as a markdown-formatted string.
43 | 
44 |     Args:
45 |         pdf_path (str): The local path to the PDF file.
46 | 
47 |     Returns:
48 |         str: The OCR result in markdown format.
49 |     """
50 |     # Ensure the model is loaded (cached across calls)
51 |     model, tokenizer = load_model()
52 | 
53 |     # Convert the first page of the PDF to a base64-encoded PNG image.
54 |     base64image = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=1024)
55 | 
56 |     # Write the image to a temporary file.
57 |     with tempfile.NamedTemporaryFile("wb", suffix=".png", delete=False) as tmp:
58 |         tmp.write(base64.b64decode(base64image))
59 |         tmp_filename = tmp.name
60 | 
61 |     # Run GOT-OCR on the saved image.
62 |     result = model.chat(tokenizer, tmp_filename, ocr_type=ocr_type)
63 | 
64 |     # Clean up the temporary file.
65 |     os.remove(tmp_filename)
66 | 
67 |     return result
68 | 


--------------------------------------------------------------------------------
/olmocr/bench/runners/run_marker.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import tempfile
 3 | 
 4 | from marker.converters.pdf import PdfConverter
 5 | from marker.models import create_model_dict
 6 | from marker.output import text_from_rendered
 7 | from pypdf import PdfReader, PdfWriter
 8 | 
 9 | _marker_converter = None
10 | 
11 | 
12 | def run_marker(pdf_path: str, page_num: int = 1) -> str:
13 |     global _marker_converter
14 | 
15 |     if _marker_converter is None:
16 |         # Create a configuration dictionary with the necessary settings
17 |         config = {
18 |             "texify_inline_spans": True,  # This enables conversion of inline math to LaTeX
19 |         }
20 | 
21 |         _marker_converter = PdfConverter(artifact_dict=create_model_dict(), config=config)
22 | 
23 |     # Extract the specific page from the PDF
24 |     pdf_to_process = pdf_path
25 |     temp_file = None
26 | 
27 |     if page_num > 0:  # If a specific page is requested
28 |         reader = PdfReader(pdf_path)
29 | 
30 |         # Check if the requested page exists
31 |         if page_num > len(reader.pages):
32 |             raise ValueError(f"Page {page_num} does not exist in the PDF. PDF has {len(reader.pages)} pages.")
33 | 
34 |         # Create a new PDF with just the requested page
35 |         writer = PdfWriter()
36 |         # pypdf uses 0-based indexing, so subtract 1 from page_num
37 |         writer.add_page(reader.pages[page_num - 1])
38 | 
39 |         # Save the extracted page to a temporary file
40 |         temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
41 |         temp_file.close()  # Close the file but keep the name
42 | 
43 |         with open(temp_file.name, "wb") as output_pdf:
44 |             writer.write(output_pdf)
45 | 
46 |         pdf_to_process = temp_file.name
47 | 
48 |     try:
49 |         # Process the PDF (either original or single-page extract)
50 |         rendered = _marker_converter(pdf_to_process)
51 |         text, _, images = text_from_rendered(rendered)
52 |         return text
53 |     finally:
54 |         # Clean up the temporary file if it was created
55 |         if temp_file and os.path.exists(temp_file.name):
56 |             os.unlink(temp_file.name)
57 | 


--------------------------------------------------------------------------------
/olmocr/bench/runners/run_mineru.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import tempfile
 3 | 
 4 | from magic_pdf.config.enums import SupportedPdfParseMethod
 5 | from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter
 6 | from magic_pdf.data.dataset import PymuDocDataset
 7 | from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 8 | from pypdf import PdfReader, PdfWriter
 9 | 
10 | 
11 | def run_mineru(pdf_path: str, page_num: int = 1) -> str:
12 |     output_folder = tempfile.TemporaryDirectory()
13 |     image_output_folder = tempfile.TemporaryDirectory()
14 | 
15 |     # Initialize writers (same for all PDFs)
16 |     image_writer = FileBasedDataWriter(image_output_folder.name)
17 |     md_writer = FileBasedDataWriter(output_folder.name)
18 | 
19 |     if page_num > 0:  # If a specific page is requested
20 |         reader = PdfReader(pdf_path)
21 | 
22 |         # Check if the requested page exists
23 |         if page_num > len(reader.pages):
24 |             raise ValueError(f"Page {page_num} does not exist in the PDF. PDF has {len(reader.pages)} pages.")
25 | 
26 |         # Create a new PDF with just the requested page
27 |         writer = PdfWriter()
28 |         # pypdf uses 0-based indexing, so subtract 1 from page_num
29 |         writer.add_page(reader.pages[page_num - 1])
30 | 
31 |         # Save the extracted page to a temporary file
32 |         temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
33 |         temp_file.close()  # Close the file but keep the name
34 | 
35 |         with open(temp_file.name, "wb") as output_pdf:
36 |             writer.write(output_pdf)
37 | 
38 |         pdf_to_process = temp_file.name
39 |     else:
40 |         pdf_to_process = pdf_path
41 | 
42 |     try:
43 |         # Read the PDF file bytes
44 |         reader = FileBasedDataReader("")
45 |         pdf_bytes = reader.read(pdf_to_process)
46 | 
47 |         # Create dataset instance
48 |         ds = PymuDocDataset(pdf_bytes)
49 | 
50 |         # Inference: decide whether to run OCR mode based on dataset classification
51 |         if ds.classify() == SupportedPdfParseMethod.OCR:
52 |             infer_result = ds.apply(doc_analyze, ocr=True)
53 |             pipe_result = infer_result.pipe_ocr_mode(image_writer)
54 |         else:
55 |             infer_result = ds.apply(doc_analyze, ocr=False)
56 |             pipe_result = infer_result.pipe_txt_mode(image_writer)
57 | 
58 |         # Generate markdown content; the image directory is the basename of the images output folder
59 |         image_dir_basename = os.path.basename(image_output_folder.name)
60 |         # md_content = pipe_result.get_markdown(image_dir_basename)
61 | 
62 |         # Dump markdown file
63 |         with tempfile.NamedTemporaryFile("w+", suffix="md") as tf:
64 |             pipe_result.dump_md(md_writer, tf.name, image_dir_basename)
65 |             tf.flush()
66 | 
67 |             tf.seek(0)
68 |             md_data = tf.read()
69 | 
70 |         return md_data
71 |     finally:
72 |         # Clean up the temporary file if it was created
73 |         if temp_file and os.path.exists(temp_file.name):
74 |             os.unlink(temp_file.name)
75 | 


--------------------------------------------------------------------------------
/olmocr/bench/runners/run_mistral.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import tempfile
 3 | 
 4 | from mistralai import Mistral
 5 | from pypdf import PdfReader, PdfWriter
 6 | 
 7 | 
 8 | def run_mistral(pdf_path: str, page_num: int = 1) -> str:
 9 |     """
10 |     Convert page of a PDF file to markdown using the mistral OCR api
11 |     https://docs.mistral.ai/capabilities/document/
12 | 
13 |     Args:
14 |         pdf_path (str): The local path to the PDF file.
15 | 
16 |     Returns:
17 |         str: The OCR result in markdown format.
18 |     """
19 |     if not os.getenv("MISTRAL_API_KEY"):
20 |         raise SystemExit("You must specify an MISTRAL_API_KEY")
21 | 
22 |     api_key = os.environ["MISTRAL_API_KEY"]
23 |     client = Mistral(api_key=api_key)
24 | 
25 |     if page_num > 0:  # If a specific page is requested
26 |         reader = PdfReader(pdf_path)
27 | 
28 |         # Check if the requested page exists
29 |         if page_num > len(reader.pages):
30 |             raise ValueError(f"Page {page_num} does not exist in the PDF. PDF has {len(reader.pages)} pages.")
31 | 
32 |         # Create a new PDF with just the requested page
33 |         writer = PdfWriter()
34 |         # pypdf uses 0-based indexing, so subtract 1 from page_num
35 |         writer.add_page(reader.pages[page_num - 1])
36 | 
37 |         # Save the extracted page to a temporary file
38 |         temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
39 |         temp_file.close()  # Close the file but keep the name
40 | 
41 |         with open(temp_file.name, "wb") as output_pdf:
42 |             writer.write(output_pdf)
43 | 
44 |         pdf_to_process = temp_file.name
45 |     else:
46 |         pdf_to_process = pdf_path
47 | 
48 |     try:
49 |         with open(pdf_to_process, "rb") as pf:
50 |             uploaded_pdf = client.files.upload(
51 |                 file={
52 |                     "file_name": os.path.basename(pdf_path),
53 |                     "content": pf,
54 |                 },
55 |                 purpose="ocr",
56 |             )
57 | 
58 |         signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)
59 | 
60 |         ocr_response = client.ocr.process(
61 |             model="mistral-ocr-2503",
62 |             document={
63 |                 "type": "document_url",
64 |                 "document_url": signed_url.url,
65 |             },
66 |         )
67 | 
68 |         client.files.delete(file_id=uploaded_pdf.id)
69 | 
70 |         return ocr_response.pages[0].markdown
71 |     finally:
72 |         # Clean up the temporary file if it was created
73 |         if temp_file and os.path.exists(temp_file.name):
74 |             os.unlink(temp_file.name)
75 | 


--------------------------------------------------------------------------------
/olmocr/bench/runners/run_olmocr_pipeline.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import logging
  3 | from dataclasses import dataclass
  4 | from typing import Optional
  5 | 
  6 | # Import necessary components from olmocr
  7 | from olmocr.pipeline import (
  8 |     MetricsKeeper,
  9 |     PageResult,
 10 |     WorkerTracker,
 11 |     process_page,
 12 |     sglang_server_host,
 13 |     sglang_server_ready,
 14 | )
 15 | 
 16 | # Setup basic logging
 17 | logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 18 | logger = logging.getLogger("olmocr_runner")
 19 | 
 20 | 
 21 | # Basic configuration
 22 | @dataclass
 23 | class Args:
 24 |     model: str = "allenai/olmOCR-7B-0225-preview"
 25 |     model_chat_template: str = "qwen2-vl"
 26 |     model_max_context: int = 8192
 27 |     target_longest_image_dim: int = 1024
 28 |     target_anchor_text_len: int = 6000
 29 |     max_page_retries: int = 8
 30 |     max_page_error_rate: float = 0.004
 31 | 
 32 | 
 33 | server_check_lock = asyncio.Lock()
 34 | 
 35 | 
 36 | async def run_olmocr_pipeline(pdf_path: str, page_num: int = 1, model: str = "allenai/olmOCR-7B-0225-preview") -> Optional[str]:
 37 |     """
 38 |     Process a single page of a PDF using the official olmocr pipeline's process_page function
 39 | 
 40 |     Args:
 41 |         pdf_path: Path to the PDF file
 42 |         page_num: Page number to process (1-indexed)
 43 | 
 44 |     Returns:
 45 |         The extracted text from the page or None if processing failed
 46 |     """
 47 |     # Ensure global variables are initialized
 48 |     global metrics, tracker
 49 |     if "metrics" not in globals() or metrics is None:
 50 |         metrics = MetricsKeeper(window=60 * 5)
 51 |     if "tracker" not in globals() or tracker is None:
 52 |         tracker = WorkerTracker()
 53 | 
 54 |     args = Args()
 55 |     args.model = model
 56 |     semaphore = asyncio.Semaphore(1)
 57 |     worker_id = 0  # Using 0 as default worker ID
 58 | 
 59 |     # Ensure server is running
 60 |     async with server_check_lock:
 61 |         _server_task = None
 62 |         try:
 63 |             await asyncio.wait_for(sglang_server_ready(), timeout=5)
 64 |             logger.info("Using existing sglang server")
 65 |         except Exception:
 66 |             logger.info("Starting new sglang server")
 67 |             _server_task = asyncio.create_task(sglang_server_host(args.model, args, semaphore))
 68 |             await sglang_server_ready()
 69 | 
 70 |     try:
 71 |         # Process the page using the pipeline's process_page function
 72 |         # Note: process_page expects both original path and local path
 73 |         # In our case, we're using the same path for both
 74 |         page_result: PageResult = await process_page(args=args, worker_id=worker_id, pdf_orig_path=pdf_path, pdf_local_path=pdf_path, page_num=page_num)
 75 | 
 76 |         # Return the natural text from the response
 77 |         if page_result and page_result.response and not page_result.is_fallback:
 78 |             return page_result.response.natural_text
 79 |         return None
 80 | 
 81 |     except Exception as e:
 82 |         logger.error(f"Error processing page: {type(e).__name__} - {str(e)}")
 83 |         return None
 84 | 
 85 |     finally:
 86 |         # We leave the server running for potential reuse
 87 |         pass
 88 | 
 89 | 
 90 | async def main():
 91 |     # Example usage
 92 |     pdf_path = "your_pdf_path.pdf"
 93 |     page_num = 1
 94 | 
 95 |     result = await run_olmocr_pipeline(pdf_path, page_num)
 96 |     if result:
 97 |         print(f"Extracted text: {result[:200]}...")  # Print first 200 chars
 98 |     else:
 99 |         print("Failed to extract text from the page")
100 | 
101 | 
102 | if __name__ == "__main__":
103 |     asyncio.run(main())
104 | 


--------------------------------------------------------------------------------
/olmocr/bench/runners/run_rolmocr.py:
--------------------------------------------------------------------------------
 1 | import httpx
 2 | 
 3 | from olmocr.data.renderpdf import render_pdf_to_base64png
 4 | 
 5 | 
 6 | async def run_rolmocr(
 7 |     pdf_path: str,
 8 |     page_num: int = 1,
 9 |     server: str = "localhost:30000",
10 |     model: str = "reducto/RolmOCR",
11 |     temperature: float = 0.2,
12 |     target_longest_image_dim: int = 1024,
13 | ) -> str:
14 |     """
15 | 
16 | 
17 |     Returns:
18 |         str: The OCR result in markdown format.
19 |     """
20 |     # Convert the first page of the PDF to a base64-encoded PNG image.
21 |     image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)
22 | 
23 |     request = {
24 |         "model": model,
25 |         "messages": [
26 |             {
27 |                 "role": "user",
28 |                 "content": [
29 |                     {
30 |                         "type": "image_url",
31 |                         "image_url": {"url": f"data:image/png;base64,{image_base64}"},
32 |                     },
33 |                     {
34 |                         "type": "text",
35 |                         "text": "Return the plain text representation of this document as if you were reading it naturally.\n",
36 |                     },
37 |                 ],
38 |             }
39 |         ],
40 |         "temperature": temperature,
41 |         "max_tokens": 4096,
42 |     }
43 | 
44 |     # Make request and get response using httpx
45 |     url = f"http://{server}/v1/chat/completions"
46 | 
47 |     async with httpx.AsyncClient(timeout=300) as client:
48 |         response = await client.post(url, json=request)
49 | 
50 |         response.raise_for_status()
51 |         data = response.json()
52 | 
53 |         choice = data["choices"][0]
54 |         assert (
55 |             choice["finish_reason"] == "stop"
56 |         ), "Response from server did not finish with finish_reason stop as expected, this is probably going to lead to bad data"
57 | 
58 |         return choice["message"]["content"]
59 | 


--------------------------------------------------------------------------------
/olmocr/bench/runners/run_server.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from typing import Literal
 3 | 
 4 | import httpx
 5 | 
 6 | from olmocr.bench.prompts import (
 7 |     build_basic_prompt,
 8 |     build_openai_silver_data_prompt_no_document_anchoring,
 9 | )
10 | from olmocr.data.renderpdf import render_pdf_to_base64png
11 | from olmocr.prompts.anchor import get_anchor_text
12 | from olmocr.prompts.prompts import (
13 |     PageResponse,
14 |     build_finetuning_prompt,
15 |     build_openai_silver_data_prompt,
16 | )
17 | 
18 | 
19 | async def run_server(
20 |     pdf_path: str,
21 |     page_num: int = 1,
22 |     server: str = "localhost:30000",
23 |     model: str = "allenai/olmOCR-7B-0225-preview",
24 |     temperature: float = 0.1,
25 |     target_longest_image_dim: int = 1024,
26 |     prompt_template: Literal["full", "full_no_document_anchoring", "basic", "finetune"] = "finetune",
27 |     response_template: Literal["plain", "json"] = "json",
28 | ) -> str:
29 |     """
30 |     Convert page of a PDF file to markdown by calling a request
31 |     running against an openai compatible server.
32 | 
33 |     You can use this for running against vllm, sglang, servers
34 |     as well as mixing and matching different model's.
35 | 
36 |     It will only make one direct request, with no retries or error checking.
37 | 
38 |     Returns:
39 |         str: The OCR result in markdown format.
40 |     """
41 |     # Convert the first page of the PDF to a base64-encoded PNG image.
42 |     image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)
43 |     anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
44 | 
45 |     if prompt_template == "full":
46 |         prompt = build_openai_silver_data_prompt(anchor_text)
47 |     elif prompt_template == "full_no_document_anchoring":
48 |         prompt = build_openai_silver_data_prompt_no_document_anchoring(anchor_text)
49 |     elif prompt_template == "finetune":
50 |         prompt = build_finetuning_prompt(anchor_text)
51 |     elif prompt_template == "basic":
52 |         prompt = build_basic_prompt()
53 |     else:
54 |         raise ValueError("Unknown prompt template")
55 | 
56 |     request = {
57 |         "model": model,
58 |         "messages": [
59 |             {
60 |                 "role": "user",
61 |                 "content": [
62 |                     {"type": "text", "text": prompt},
63 |                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
64 |                 ],
65 |             }
66 |         ],
67 |         "temperature": temperature,
68 |         "max_tokens": 3000,
69 |     }
70 | 
71 |     # Make request and get response using httpx
72 |     url = f"http://{server}/v1/chat/completions"
73 | 
74 |     async with httpx.AsyncClient(timeout=300) as client:
75 |         response = await client.post(url, json=request)
76 | 
77 |         response.raise_for_status()
78 |         data = response.json()
79 | 
80 |         choice = data["choices"][0]
81 |         assert (
82 |             choice["finish_reason"] == "stop"
83 |         ), "Response from server did not finish with finish_reason stop as expected, this is probably going to lead to bad data"
84 | 
85 |         if response_template == "json":
86 |             page_data = json.loads(choice["message"]["content"])
87 |             page_response = PageResponse(**page_data)
88 |             return page_response.natural_text
89 |         elif response_template == "plain":
90 |             return choice["message"]["content"]
91 | 


--------------------------------------------------------------------------------
/olmocr/bench/runners/run_transformers.py:
--------------------------------------------------------------------------------
  1 | import base64
  2 | import json
  3 | from io import BytesIO
  4 | from typing import Literal
  5 | 
  6 | import torch
  7 | from PIL import Image
  8 | from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
  9 | 
 10 | from olmocr.data.renderpdf import render_pdf_to_base64png
 11 | from olmocr.prompts.anchor import get_anchor_text
 12 | from olmocr.prompts.prompts import (
 13 |     PageResponse,
 14 |     build_finetuning_prompt,
 15 |     build_openai_silver_data_prompt,
 16 | )
 17 | 
 18 | _cached_model = None
 19 | _cached_processor = None
 20 | 
 21 | 
 22 | def run_transformers(
 23 |     pdf_path: str,
 24 |     page_num: int = 1,
 25 |     model: str = "allenai/olmOCR-7B-0225-preview",
 26 |     temperature: float = 0.1,
 27 |     target_longest_image_dim: int = 1024,
 28 |     prompt_template: Literal["full", "finetune"] = "finetune",
 29 |     response_template: Literal["plain", "json"] = "json",
 30 | ) -> str:
 31 |     """
 32 |     Convert page of a PDF file to markdown by calling a request
 33 |     running against an openai compatible server.
 34 | 
 35 |     You can use this for running against vllm, sglang, servers
 36 |     as well as mixing and matching different model's.
 37 | 
 38 |     It will only make one direct request, with no retries or error checking.
 39 | 
 40 |     Returns:
 41 |         str: The OCR result in markdown format.
 42 |     """
 43 |     # Initialize the model
 44 |     global _cached_model, _cached_processor
 45 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 46 | 
 47 |     if _cached_model is None:
 48 |         model = Qwen2VLForConditionalGeneration.from_pretrained(model, torch_dtype=torch.bfloat16).eval()
 49 |         processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
 50 |         model = model.to(device)
 51 | 
 52 |         _cached_model = model
 53 |         _cached_processor = processor
 54 |     else:
 55 |         model = _cached_model
 56 |         processor = _cached_processor
 57 | 
 58 |     # Convert the first page of the PDF to a base64-encoded PNG image.
 59 |     image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)
 60 |     anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
 61 | 
 62 |     if prompt_template == "full":
 63 |         prompt = build_openai_silver_data_prompt(anchor_text)
 64 |     else:
 65 |         prompt = build_finetuning_prompt(anchor_text)
 66 | 
 67 |     messages = [
 68 |         {
 69 |             "role": "user",
 70 |             "content": [
 71 |                 {"type": "text", "text": prompt},
 72 |                 {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
 73 |             ],
 74 |         }
 75 |     ]
 76 | 
 77 |     # Apply the chat template and processor
 78 |     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 79 |     main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
 80 | 
 81 |     inputs = processor(
 82 |         text=[text],
 83 |         images=[main_image],
 84 |         padding=True,
 85 |         return_tensors="pt",
 86 |     )
 87 |     inputs = {key: value.to(device) for (key, value) in inputs.items()}
 88 | 
 89 |     # Generate the output
 90 |     MAX_NEW_TOKENS = 3000
 91 |     with torch.no_grad():
 92 |         output = model.generate(
 93 |             **inputs,
 94 |             temperature=temperature,
 95 |             max_new_tokens=MAX_NEW_TOKENS,
 96 |             num_return_sequences=1,
 97 |             do_sample=True,
 98 |         )
 99 | 
100 |     # Decode the output
101 |     prompt_length = inputs["input_ids"].shape[1]
102 |     new_tokens = output[:, prompt_length:]
103 |     text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
104 | 
105 |     assert new_tokens.shape[1] < MAX_NEW_TOKENS, "Output exceed max new tokens"
106 | 
107 |     if response_template == "json":
108 |         page_data = json.loads(text_output)
109 |         page_response = PageResponse(**page_data)
110 |         return page_response.natural_text
111 |     elif response_template == "plain":
112 |         return text_output
113 | 


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/olmocr_pipeline/buildingnotes_pg1_repeat1.md:
--------------------------------------------------------------------------------
 1 | Master - 7 1/4 - 36"
 2 | Master Bath - 7 1/4 - 30"
 3 | Laundry - 4 3/4 - 36"
 4 | Bath - 7 1/4 - 24"
 5 | MUD - 7 - 36"
 6 | UTIL - 8 1/4 - 36"
 7 | DOWN BATH - 7 1/4 - 32"
 8 | BUT KIT - 6 3/4 - 30
 9 | PANTRY - 4 3/4 - 24
10 | 6 WEST - 32 9/8 - 32
11 | 6 WEST BATH 5" - 24"


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/olmocr_pipeline/earnings_pg1_repeat1.md:
--------------------------------------------------------------------------------
 1 | Recently Issued Accounting Pronouncements
 2 | 
 3 | Recently Adopted Accounting Pronouncement
 4 | 
 5 | In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information.
 6 | 
 7 | Recent Accounting Pronouncements Not Yet Adopted
 8 | 
 9 | In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
10 | 
11 | In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures.
12 | 
13 | Note 2 - Business Combination
14 | 
15 | Termination of the Arm Share Purchase Agreement
16 | 
17 | In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing.
18 | 
19 | Note 3 - Stock-Based Compensation
20 | 
21 | Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP.
22 | 
23 | Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows:
24 | 
25 | | Year Ended | Jan 29, 2023 | Jan 28, 2024 | Jan 29, 2023 |
26 | |------------|--------------|--------------|--------------|
27 | | (In millions) | $138 | $141 | $138 |
28 | | Cost of revenue | $178 | $141 | $138 |
29 | | Research and development | 3,423 | 2,532 | 1,892 |
30 | | Sales, general and administrative | 1,136 | 876 | 680 |
31 | | Total | $4,737 | $3,549 | $2,710 |
32 | 
33 | Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023.


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ff1fc6a205ad039139ce566851b6b260c929_pg1_pg1_repeat1.md:
--------------------------------------------------------------------------------
 1 | RTG Degradation Primer and Application to MMRTG
 2 | 
 3 | Nuclear and Emerging Technology for Space (NETS) 2015
 4 | February 23-26, 2015
 5 | Abstract 5107
 6 | 
 7 | Presenting Author: Tom Hammel, Teledyne Energy Systems
 8 | Co-Authors: Russell Bennett, Teledyne Energy Systems
 9 | Robert Sievers, Teledyne Energy Systems
10 | Bill Otting, Aerojet Rocketdyne


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ff3d6e051903fe5ca9bc172ece14964c5632_pg1_pg1_repeat1.md:
--------------------------------------------------------------------------------
1 | بررسی دیدگاه و نظرات کتابداران و اعضای هیئت علمی دانشگاه شیراز در بره گیری از فناوری شبکه‌های پی سیم در کتابخانه‌های دانشگاهی
2 | 
3 | چکیده: نظر به اهمیت و کاربرد گسترده شبکه‌های پی سیم در محیط‌های دانشگاهی در کشورهای پیشرفته و استفاده از آن در خدمات کتابخانه‌ای، بهره‌گیری از این فناوری در کتابخانه‌های دانشگاهی کشور احساس می‌شود. این پژوهش با استفاده از روش پیام‌برداری، با هدف معرفی شبکه‌های پی سیم، به بررسی دیدگاه و نظرات کتابداران و اعضای هیئت علمی دانشگاه شیراز، در یک کارگیری از این شبکه‌ها در کتابخانه‌های دانشگاهی پرداخت. باقی‌مانده‌های تحقیق نشان داد که کتابداران تا مایل زیادی به استفاده از شبکه‌های پی سیم در امر خدمات کتابخانه‌های تظیم می‌باشند و قفسه‌خوانی و دسترسی به فهرست عمومی پوسته دارند. و گسترش گی پوشش و سیاست‌های پیشرفته و سیاست‌های پیشرفته در کتابخانه‌ها را خواستار شده‌اند. در حالی که اعضای هیئت علمی، ضرورت استفاده بیشتر از این شبکه‌ها در کل محیط دانشگاه و دسترسی به منابع کتابخانه‌ای از خارج از محیط کتابخانه را خواستار شده‌اند. در کل، با توجه به نتایج حاصل از این تحقیق می‌توان گفت که هر چند استفاده از شبکه‌های پی سیم در کتابخانه‌ها پیش‌بینی‌های زیادی و نوآوری‌های می‌باشد و هنوز در کشور ما چندان مورد توجه قرار نگرفته و ناشناخته مانده است، اما تماشای کتابداران، پژوهشگران و استادان به استفاده و کاربرد آن در محیط‌های دانشگاهی زیاد است.
4 | 
5 | کلیدواژه‌ها: شبکه‌های پی سیم؛ کتابخانه‌های دانشگاهی؛ اعضای هیئت علمی؛ کتابداران؛ رایانه‌های قابل حمل؛ رایانه‌های دستی؛ منابع و خدمات کتابخانه‌ای؛ دانشگاه شیراز
6 | 
7 | farbod4ever@gmail.com
8 | 
9 | نویسنده رابطه:


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ff4f7dad78081cff727d19ab51c181d4a661_pg1_pg1_repeat1.md:
--------------------------------------------------------------------------------
 1 | Molecular markers of breast cancer metastasis
 2 | Weigelt, B.
 3 | 
 4 | Citation for published version (APA):
 5 | Weigelt, B. (2005). Molecular markers of breast cancer metastasis
 6 | 
 7 | General rights
 8 | It is not permitted to download or to forward/distribute the text or part of it without the consent of the author(s) and/or copyright holder(s), other than for strictly personal, individual use, unless the work is under an open content license (like Creative Commons).
 9 | 
10 | Disclaimer/Complaints regulations
11 | If you believe that digital publication of certain material infringes any of your rights or (privacy) interests, please let the Library know, stating your reasons. In case of a legitimate complaint, the Library will make the material inaccessible and/or remove it from the website. Please Ask the Library: http://uba.uva.nl/en/contact, or a letter to: Library of the University of Amsterdam, Secretariat, Singel 425, 1012 WP Amsterdam, The Netherlands. You will be contacted as soon as possible.


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ff518b1240a66978f22035528ccb029450b5_pg2_pg1_repeat1.md:
--------------------------------------------------------------------------------
 1 | Brief Notices
 2 | 
 3 | Prophet of the Jubilee, translated and edited by Ronald D. Dennis (Religious Studies Center, Brigham Young University, 1997)
 4 | 
 5 | In July 1846 in Rhydybont, Carmarthenshire, Wales, Dan Jones published the first issue of a monthly LDS periodical in the Welsh language on a press owned by John Jones, Dan’s brother, who was an ordained Congregational minister. The periodical, Prophwyd y Jubili (Prophet of the Jubilee), ran monthly thereafter through December 1848. Jones’s great-great-grandson Ronald Dennis has presented what he calls a “facsimile translation” (xxix) of the complete series, retaining original fonts, layout, and pagination, slightly enlarging font size for readability. Text and index are over seven hundred pages, and Geraint Bowen, former Archdruid of Wales, offers a superb introduction.
 6 | 
 7 | Many articles in Prophet of the Jubilee rebut arguments of local anti-Mormons or apostates. Articles entitled “The ‘Hater of Deceit’ Proving Himself a False Prophet Again!!” and “The ‘Rev. W. R. Davies, from Dowlais,’ and His Cruel and Shameful Persecution Again!—Again!!” give a glimpse of the intense feelings between early Welsh Saints and their religious adversaries. Jones garnishes his numerous doctrinal treatises with occasional fiction and poetry, excerpts translated from the Millennial Star, the neighboring LDS periodical in England, and portions of articles on religious topics taken from European and U. S. newspapers.
 8 | 
 9 | A brief summary of each article is provided at the beginning of the book, but after that the reader is left to plod through the text without annotations. While pagination is sure to confuse some readers, Prophet of the Jubilee opens up LDS historical documents that have been inaccessible to most English-speaking readers for 150 years. Here is a mass of interesting cultural and doctrinal history, as well as the voice of Dan Jones himself, one of the most prolific and persistent missionaries in the history of the Church.
10 | 
11 | —Jed L. Woodworth
12 | 
13 | Book of Mormon Authors: Their Words and Messages, by Roger R. Keller (Religious Studies Center, Brigham Young University, 1996)
14 | 
15 | The statistical study of Book of Mormon texts is a well-traveled road in Book of Mormon scholarship. However, in Book of Mormon Authors, Roger Keller shows


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ffaac214730d2b8c2ec842e3618ccb9c4259_pg1_pg1_repeat1.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ffaac214730d2b8c2ec842e3618ccb9c4259_pg1_pg1_repeat1.md


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/olmocr_pipeline/headers_footers/fff590bed29a2854ac1f874dad5752ede1aa_pg1_pg1_repeat1.md:
--------------------------------------------------------------------------------
 1 | User’s Manual
 2 | 
 3 | Model 475
 4 | DSP Gaussmeter
 5 | 
 6 | Lake Shore Cryotronics, Inc.
 7 | 575 McCorkle Blvd.
 8 | Westerville, Ohio 43082-8888 USA
 9 | 
10 | E-mail Addresses:
11 | sales@lakeshore.com
12 | service@lakeshore.com
13 | 
14 | Visit Our Website At:
15 | www.lakeshore.com
16 | 
17 | Fax: (614) 891-1392
18 | Telephone: (614) 891-2243
19 | 
20 | Methods and apparatus disclosed and described herein have been developed solely on company funds of Lake Shore Cryotronics, Inc. No government or other contractual support or relationship whatsoever has existed which in any way affects or mitigates proprietary rights of Lake Shore Cryotronics, Inc. in these developments. Methods and apparatus disclosed herein may be subject to U.S. Patents existing or applied for. Lake Shore Cryotronics, Inc. reserves the right to add, improve, modify, or withdraw functions, design modifications, or products at any time without notice. Lake Shore shall not be liable for errors contained herein or for incidental or consequential damages in connection with furnishing, performance, or use of this material.


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/olmocr_pipeline/lincoln_letter_pg1_repeat1.md:
--------------------------------------------------------------------------------
1 | Executive Mansion,
2 | 
3 | Washington City,
4 | 
5 | January 15th, 1864
6 | 
7 | Major General Hitchcock, Commissioner of Exchanges, is authorized and directed to offer Brigadier General Trimble, now a prisoner of war in Fort McHenry, in exchange for Major White, who is held as a prisoner at Richmond. He is also directed to send forward the offer of exchange by Henry M. Warfield, Esq. of Baltimore, under a flag of truce, and give him a pass to City Point.
8 | 
9 | Abraham Lincoln


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/olmocr_pipeline/math_2503_04086_pg1_repeat1.md:
--------------------------------------------------------------------------------
 1 | Proof. Let $S$ be the generating set associated with $D$ as described in Proposition 2.5. By the circulant diagonalization theorem, the spectrum of $G_R(D) = \Gamma(R, S)$ is the multiset $\{\lambda_g\}_{g \in R}$ where
 2 | 
 3 | $$\lambda_g = \sum_{s \in S} \zeta_n^{\psi(gs)} = \sum_{i=1}^k \left[ \sum_{s, Rs = I_i} \zeta_n^{\psi(gs)} \right].$$
 4 | 
 5 | We remark that by Corollary 2.7, if $s \in R$ such that $Rs = I_i = Rx_i$ then $s$ has a unique representation of the form $s = ux_i$ where $u \in (R/\text{Ann}_R(x_i))^\times$ and $\hat{u}$ is a fixed lift of $u$ to $R^\times$. With this presentation, we can write
 6 | 
 7 | $$\sum_{s, Rs = I_i} \zeta_n^{\psi(gs)} = \sum_{u \in (R/\text{Ann}_R(x_i))^\times} \zeta_n^{\psi(gux_i)} = \sum_{u \in (R/\text{Ann}_R(x_i))^\times} \zeta_n^{\psi_xi(gu)} = c(g, R/\text{Ann}_R(x_i)).$$
 8 | 
 9 | Here we recall that $\psi_xi$ is the induced linear functional on $R/\text{Ann}_R(x_i)$. We conclude that $\lambda_g = \sum_{i=1}^k c(g, R/\text{Ann}_R(x_i)).$ \hfill $\square$
10 | 
11 | The following corollary is simple yet important for our future work on perfect state transfers on gcd-graphs.
12 | 
13 | **Corollary 4.17.** Suppose that $g' = ug$ for some $u \in R^\times$. Then $\lambda_g = \lambda_{g'}$.
14 | 
15 | **Acknowledgements**
16 | 
17 | We thank the Department of Mathematics and Computer Science at Lake Forest College for their generous financial support through an Overleaf subscription. We also thank Ján Mináč for his constant encouragement and support.
18 | 
19 | **References**
20 | 
21 | 1. Reza Akhtar, Megan Boggess, Tiffany Jackson-Henderson, Isidora Jiménez, Rachel Karpman, Amanda Kinzel, and Dan Pritikin, *On the unitary Cayley graph of a finite ring*, Electron. J. Combin. 16 (2009), no. 1, Research Paper 117, 13 pages.
22 | 2. Milan Bašić, Aleksandar Ilić, and Aleksandar Stamenković, *Maximal diameter of integral circulant graphs*, Information and Computation 301 (2024), 105208.
23 | 3. Maria Chudnovsky, Michal Cizek, Logan Crew, Ján Mináč, Tung T. Nguyen, Sophie Spirkl, and Nguyễn Duy Tấn, *On prime Cayley graphs*, arXiv:2401.06062, to appear in Journal of Combinatorics (2024).
24 | 4. Thomas Honold, *Characterization of finite frobenius rings*, Archiv der Mathematik 76 (2001), no. 6, 406–415.
25 | 5. Irving Kaplansky, *Elementary divisors and modules*, Transactions of the American Mathematical Society 66 (1949), no. 2, 464–491.
26 | 6. Walter Klotz and Torsten Sander, *Some properties of unitary Cayley graphs*, The Electronic Journal of Combinatorics 14 (2007), no. 1, R45, 12 pages.
27 | 7. Erich Lamprecht, *Allgemeine theorie der Gaußschen Summen in endlichen kommutativen Ringen*, Mathematische Nachrichten 9 (1953), no. 3, 149–196.
28 | 8. Ján Mináč, Tung T Nguyen, and Nguyen Duy Tấn, *Isomorphic gcd-graphs over polynomial rings*, arXiv preprint arXiv:2411.01768 (2024).
29 | 9. ______, *On the gcd graphs over polynomial rings*, arXiv preprint arXiv:2409.01929 (2024).


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/olmocr_pipeline/mathfuncs_colswitch_pg1_repeat1.md:
--------------------------------------------------------------------------------
 1 | # The 20 Most Important Mathematical Equations
 2 | 
 3 | A journey through the most elegant and influential formulas in mathematics
 4 | 
 5 | | 1. Euler’s Identity | 3. The Fundamental Theorem of Calculus |
 6 | |---------------------|----------------------------------------|
 7 | | \( e^{i\pi} + 1 = 0 \) | \( \int_a^b f(x) \, dx = F(b) - F(a) \) |
 8 | | Connects five fundamental constants (e, i, π, 1, 0), revealing the profound relationship between exponential functions and trigonometry. | Establishes that differentiation and integration are inverse operations. If F is an antiderivative of f, the definite integral equals F(b) - F(a). Revolutionized mathematical problem-solving. |
 9 | 
10 | | 2. Pythagorean Theorem | 4. Maxwell’s Equations |
11 | |------------------------|-----------------------|
12 | | \( a^2 + b^2 = c^2 \) | \( \nabla \cdot \mathbf{E} = \frac{\rho}{\varepsilon_0} \) |
13 | | In right triangles, the hypotenuse squared equals the sum of the squares of the other sides. Cornerstone of geometry with applications in navigation and architecture. | \( \nabla \cdot \mathbf{B} = 0 \) |
14 | | | \( \nabla \times \mathbf{E} = -\frac{\partial \mathbf{B}}{\partial t} \) |
15 | | | \( \nabla \times \mathbf{B} = \mu_0 \mathbf{J} + \mu_0 \varepsilon_0 \frac{\partial \mathbf{E}}{\partial t} \) |
16 | | | Unified electricity and magnetism as manifestations of the same force. Describes electromagnetic field behavior, predicting waves traveling at light speed. Enabled technologies from radio to smartphones. |


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/olmocr_pipeline/mathfuncs_pg1_repeat1.md:
--------------------------------------------------------------------------------
 1 | # The 20 Most Important Mathematical Equations
 2 | 
 3 | A journey through the most elegant and influential formulas in mathematics
 4 | 
 5 | | 1. Euler's Identity | 2. Pythagorean Theorem |
 6 | |--------------------|------------------------|
 7 | | \( e^{i\pi} + 1 = 0 \) | \( a^2 + b^2 = c^2 \) |
 8 | 
 9 | Connects five fundamental constants (e, i, π, 1, 0), revealing the profound relationship between exponential functions and trigonometry.
10 | 
11 | In right triangles, the hypotenuse squared equals the sum of the squares of the other sides. Cornerstone of geometry with applications in navigation and architecture.
12 | 
13 | | 3. The Fundamental Theorem of Calculus | 4. Maxwell's Equations |
14 | |----------------------------------------|------------------------|
15 | | \( \int_{a}^{b} f(x) \, dx = F(b) - F(a) \) | \( \nabla \cdot \mathbf{E} = \frac{Q}{\varepsilon_0} \) |
16 | 
17 | Establishes that differentiation and integration are inverse operations. If \( F \) is an antiderivative of \( f \), the definite integral equals \( F(b) - F(a) \). Revolutionized mathematical problem-solving.
18 | 
19 | Unified electricity and magnetism as manifestations of the same force. Describes electromagnetic field behavior, predicting waves traveling at light speed. Enabled technologies from radio to smartphones.


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/olmocr_pipeline/mattsnotes_pg1_repeat1.md:
--------------------------------------------------------------------------------
 1 | V-February Flow
 2 | 
 3 | Data Components:
 4 | 
 5 | Code:
 6 | The-Stack-V2
 7 | 
 8 | CodeText:
 9 | SE, whatever we've scraped
10 | 
11 | WebText:
12 | HQ DCLM
13 | 
14 | DATA MIXES
15 | 
16 | ~85% Source Code
17 | ~10% CodeText
18 | ~5% Webtext
19 | 
20 | ~85% The-stack-V2
21 | ~15% CodeText
22 | ~0% Webtext
23 | 
24 | ~100% Source Code
25 | 
26 | Deepseek Coder
27 | 
28 | StarCoder 2
29 | 
30 | Arctic


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/olmocr_pipeline/mattsnotes_pg2_repeat1.md:
--------------------------------------------------------------------------------
 1 | P1: 100% Source code
 2 | P2: 80% code
 3 | 20% language
 4 | 
 5 | Code Data Recipe [StackCoder]
 6 | 1) Order by Repo ✓
 7 | 2) Call Heuristic Filters ✗
 8 | 3) Group by Repo, lang → minhash ✓
 9 | 4) Pack into Repo-level docs □
10 | 5) Select PL's □
11 | 
12 | 6) Pack into FIM tokens ✗
13 | 
14 | ✓: Eng Done
15 | X: Eng definitely NOT done
16 | D: so so easy
17 | 
18 | Use Preprocessed code/text, web/text


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/olmocr_pipeline/mattsnotes_pg3_repeat1.md:
--------------------------------------------------------------------------------
1 | ARCH + TRAINING
2 | 
3 | - Pick Arch like OLMO-IB
4 | - OR replicate a 3D model
5 | - Follow standard LR flow
6 | 
7 | Eval:
8 | 
9 | Hacky nonsense for now


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/olmocr_pipeline/olmo2-pg4_pg1_repeat1.md:
--------------------------------------------------------------------------------
 1 | Table 1 Composition of the pretraining data for OLMo 2. The OLMo 2 1124 Mix is composed of StarCoder (Li et al., 2023b; Kocetkov et al., 2022), peS2o (Soldaini and Lo, 2023), web text from DCLM (Li et al., 2024) and Wiki come from Dolma 1.7 (Soldaini et al., 2024). arXiv comes from Red-Pajama (Together AI, 2023), while OpenWebMath (Paster et al., 2023) and Algebraic Stack come from ProofPile II (Azerbayev et al., 2023).
 2 | 
 3 | | Source                        | Type                  | Tokens  | Words  | Bytes  | Docs  |
 4 | |-------------------------------|-----------------------|---------|--------|--------|-------|
 5 | | DCLM-Baseline                 | Web pages             | 3.71T   | 3.32T  | 21.32T | 2.95B |
 6 | | StarCoder                     | Code                  | 83.0B   | 70.0B  | 459B   | 78.7M |
 7 | | filtered version from OLMoE Mix | Academic papers      | 58.6B   | 51.1B  | 413B   | 38.8M |
 8 | | peS2o from Dolma 1.7          | Math web pages        | 12.2B   | 11.1B  | 47.2B  | 2.89M |
 9 | | arXiv                         | Math proofs code      | 11.8B   | 10.8B  | 44.0B  | 2.83M |
10 | | OpenWebMath                   | Encyclopedic          | 3.7B    | 3.16B  | 16.2B  | 6.17M |
11 | | Wikipedia & Wikibooks from Dolma 1.7 |                      |         |        |        |       |
12 | | Total                         |                       | 3.90T   | 3.48T  | 22.38T | 3.08B |
13 | 
14 | 2.1.1 Pretraining data: OLMo 2 Mix 1124
15 | 
16 | The mix used for this stage is shown in Table 1. It consists of approximately 3.9 trillion tokens, with over 95% derived from web data. We refer to this set as OLMo 2 Mix 1124. This is the same pretraining data used in OLMoE (Muennighoff et al., 2024).
17 | 
18 | We combine data from DCLM (Li et al., 2024) and Dolma 1.7 (Soldaini et al., 2024). From DCLM, we use the “baseline 1.0” mix. From Dolma, we use the arXiv (Together AI, 2023), OpenWebMath (Paster et al., 2023), Algebraic Stack, peS2o (Soldaini and Lo, 2023), and Wikipedia subsets. arXiv, OpenWebMath, and Algebraic Stack were originally part of ProofPile II (Azerbayev et al., 2023).
19 | 
20 | Finally, we include code from StarCoder (Li et al., 2023b), which is derived from permissively-licensed repositories from GitHub (Kocetkov et al., 2022). In an attempt to include higher quality code, we remove any document from a repository with fewer than 2 stars on GitHub. Further, through manual inspection of this source, we found it to contain documents encoded in binary format or containing mostly numerical content; to remove them, we discarded documents whose most frequent word constitutes over 30% of the document, or whose top-2 most frequent words constitute over 50% of the document. To mitigate possible training loss spikes, we remove documents with repeated sequences of 32 or more n-grams. We report details and show effectiveness of this intervention in Section §3.1.
21 | 
22 | 2.1.2 Mid-training data: Dolmino Mix 1124
23 | 
24 | After the initial pretraining stage on mostly web data, we further train with a mixture of web data that has been more restrictively filtered for quality and a collection of domain-specific high quality data, much of which is synthetic. The purpose of this mixture is to imbue the model with math-centric skills and provide focused exposure to STEM references and high quality text. We generate several variants of this mixture, with varying sizes, but generally refer to this mixture as Dolmino Mix 1124. The base sources from which Dolmino Mix 1124 is subsampled are described in Table 2. We refer the reader to Section §4 for a deep dive detailing our processes for experimenting and curating data for this mix.


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/olmocr_pipeline/openstax_caculus_pg_273_pg1_repeat1.md:
--------------------------------------------------------------------------------
 1 | 3.4 EXERCISES
 2 | 
 3 | For the following exercises, the given functions represent the position of a particle traveling along a horizontal line.
 4 | 
 5 | a. Find the velocity and acceleration functions.
 6 | 
 7 | b. Determine the time intervals when the object is slowing down or speeding up.
 8 | 
 9 | 150. \( s(t) = 2t^3 - 3t^2 - 12t + 8 \)
10 | 
11 | 151. \( s(t) = 2t^3 - 15t^2 + 36t - 10 \)
12 | 
13 | 152. \( s(t) = \frac{t}{1 + t^2} \)
14 | 
15 | 153. A rocket is fired vertically upward from the ground. The distance \( s \) in feet that the rocket travels from the ground after \( t \) seconds is given by \( s(t) = -16t^2 + 560t \).
16 | 
17 | a. Find the velocity of the rocket 3 seconds after being fired.
18 | 
19 | b. Find the acceleration of the rocket 3 seconds after being fired.
20 | 
21 | 154. A ball is thrown downward with a speed of 8 ft/s from the top of a 64-foot-tall building. After \( t \) seconds, its height above the ground is given by \( s(t) = -16t^2 - 8t + 64 \).
22 | 
23 | a. Determine how long it takes for the ball to hit the ground.
24 | 
25 | b. Determine the velocity of the ball when it hits the ground.
26 | 
27 | 155. The position function \( s(t) = t^2 - 3t - 4 \) represents the position of the back of a car backing out of a driveway and then driving in a straight line, where \( s \) is in feet and \( t \) is in seconds. In this case, \( s(t) = 0 \) represents the time at which the back of the car is at the garage door, so \( s(0) = -4 \) is the starting position of the car, 4 feet inside the garage.
28 | 
29 | a. Determine the velocity of the car when \( s(t) = 0 \).
30 | 
31 | b. Determine the velocity of the car when \( s(t) = 14 \).
32 | 
33 | 156. The position of a hummingbird flying along a straight line in \( t \) seconds is given by \( s(t) = 3t^3 - 7t \) meters.
34 | 
35 | a. Determine the velocity of the bird at \( t = 1 \) sec.
36 | 
37 | b. Determine the acceleration of the bird at \( t = 1 \) sec.
38 | 
39 | c. Determine the acceleration of the bird when the velocity equals 0.
40 | 
41 | 157. A potato is launched vertically upward with an initial velocity of 100 ft/s from a potato gun at the top of an 85-foot-tall building. The distance in feet that the potato travels from the ground after \( t \) seconds is given by \( s(t) = -16t^2 + 100t + 85 \).
42 | 
43 | a. Find the velocity of the potato after 0.5 s and 5.75 s.
44 | 
45 | b. Find the speed of the potato at 0.5 s and 5.75 s.
46 | 
47 | c. Determine when the potato reaches its maximum height.
48 | 
49 | d. Find the acceleration of the potato at 0.5 s and 1.5 s.
50 | 
51 | e. Determine how long the potato is in the air.
52 | 
53 | f. Determine the velocity of the potato upon hitting the ground.
54 | 
55 | 158. The position function \( s(t) = t^3 - 8t \) gives the position in miles of a freight train where east is the positive direction and \( t \) is measured in hours.
56 | 
57 | a. Determine the direction the train is traveling when \( s(t) = 0 \).
58 | 
59 | b. Determine the direction the train is traveling when \( s(t) = 0 \).
60 | 
61 | c. Determine the time intervals when the train is slowing down or speeding up.
62 | 
63 | 159. The following graph shows the position \( y = s(t) \) of an object moving along a straight line.
64 | 
65 | ![Graph of position function](image)
66 | 
67 | a. Use the graph of the position function to determine the time intervals when the velocity is positive, negative, or zero.
68 | 
69 | b. Sketch the graph of the velocity function.
70 | 
71 | c. Use the graph of the velocity function to determine the time intervals when the acceleration is positive, negative, or zero.
72 | 
73 | d. Determine the time intervals when the object is speeding up or slowing down.


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/olmocr_pipeline/small_page_size_pg1_repeat1.md:
--------------------------------------------------------------------------------
 1 | any—was very trifling. Since the use of bones has, however, become general, the turnip crop has been, in many instances, ten-fold, and in few less than four or five-fold its former bulk. All the succeeding crops of grain and seeds have been amazingly increased, and, upon the four or five-shift system, there is no doubt the land will go on progressively improving, requiring a less quantity of bones annually, from its increased fertility and power."
 2 | 
 3 | On light loams, the returns to the Doncaster Committee give bones a preference to farm-yard dung. And we learn that, upon the calcareous soil of the Yorkshire Wolds, heavy crops of turnips have been raised from 16 bushels per acre of bones, while in the same field, and under similar circumstances, but manured from the farm-yard at the rate of from 8 to 10 tons per acre, the turnips have been of the most inferior description.
 4 | 
 5 | On peat soils, if previously drained and laid dry, their advantages are reported to be so striking, that from fifteen to twenty bushels of dust per acre, drilled, have been also found to very far surpass the ordinary dressing of stable-dung, and even of lime and pigeons'-dung.
 6 | 
 7 | On gravels, the reports are meagre and contradictory, though perhaps reconcilable in principle, as it has been justly observed, that "a gravelly soil may embrace every variety of texture and quality, from the light dry sand to the water-logged yellow clay—preserving in each the necessary admixture of stones and grit." To wet gravel, their application has been found decidedly unfavourable.
 8 | 
 9 | **ANALYSIS.**
10 | 
11 | An examination of the component parts of soils, and of the power of bones, when applied to them as manure, would go far to explain the irregularity of their different effects upon various kinds of soil. Bone is known to consist of about equal parts of earthy and animal matter; the former chiefly composed of gypsum—which is of so indestructible a nature as to have been termed, by early chemists, the "earth of bones"—and a small portion of carbonate of lime; from which we may conclude that probably half the weight of bones is in the greater part consumed by plants as direct nourishment in their state of growth, and that the remainder is more gradually absorbed by the soil, as well also as by the plants; for lime, though in small amount, is always present, in greater or less quantity, in all vegetable substances.
12 | 
13 | "The quantity of earthy matter varies according to the age of the animal; and, in like manner, the quantity of animal matter varies also in proportion to the condition of the animal. In the best kinds of bones for manure, viz., those from fat young animals, perhaps the following proportions may give an approximation to the relative quantities of each in 100 parts:
14 | 
15 | | Earthy and saline matter | 40 |
16 | | Cartilage and jelly | 40 |
17 | | Fatty matter | 20 |
18 | 
19 | The soft parts thus form, in the best bone, about sixty, and upon an average, perhaps, amount to fifty per cent., which are almost entirely constituted of the same elements of plants, and all of them, sooner or later, liable to be dissolved and absorbed by the roots. The cartilage, indeed, when the bones have been buried in a dry situation, is very indestructible; but when exposed to the action of air, water, soil, and vegetation, will probably pass into the state of jelly, and be dissolved, or otherwise decomposed,
20 | 
21 | * Doncaster Report, p. 8.


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/olmocr_pipeline/test-graphical-text_pg1_repeat1.md:
--------------------------------------------------------------------------------
1 | THE POWER OF STORYTELLING
2 | FOR LEADERS
3 | ดร.วิทย์ สิทธิเวคิน


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/buildingnotes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/buildingnotes.pdf


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/discoverworld_crazy_table4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/discoverworld_crazy_table4.pdf


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/earnings.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/earnings.pdf


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/headers_footers/ff0f0b22c55d8b90dd77d153f48e144fc9db_pg2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/headers_footers/ff0f0b22c55d8b90dd77d153f48e144fc9db_pg2.pdf


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/headers_footers/ff1fc6a205ad039139ce566851b6b260c929_pg1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/headers_footers/ff1fc6a205ad039139ce566851b6b260c929_pg1.pdf


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/headers_footers/ff3d6e051903fe5ca9bc172ece14964c5632_pg1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/headers_footers/ff3d6e051903fe5ca9bc172ece14964c5632_pg1.pdf


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/headers_footers/ff4f7dad78081cff727d19ab51c181d4a661_pg1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/headers_footers/ff4f7dad78081cff727d19ab51c181d4a661_pg1.pdf


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/headers_footers/ff518b1240a66978f22035528ccb029450b5_pg2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/headers_footers/ff518b1240a66978f22035528ccb029450b5_pg2.pdf


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/headers_footers/ffaac214730d2b8c2ec842e3618ccb9c4259_pg1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/headers_footers/ffaac214730d2b8c2ec842e3618ccb9c4259_pg1.pdf


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/headers_footers/fff590bed29a2854ac1f874dad5752ede1aa_pg1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/headers_footers/fff590bed29a2854ac1f874dad5752ede1aa_pg1.pdf


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/lincoln_letter.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/lincoln_letter.pdf


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/math_2503_04086.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/math_2503_04086.pdf


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/mathfuncs.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/mathfuncs.pdf


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/mathfuncs_colswitch.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/mathfuncs_colswitch.pdf


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/mattsnotes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/mattsnotes.pdf


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/multi_column_miss.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/multi_column_miss.pdf


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/olmo2-pg4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/olmo2-pg4.pdf


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/openstax_caculus_pg_273.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/openstax_caculus_pg_273.pdf


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/small_page_size.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/small_page_size.pdf


--------------------------------------------------------------------------------
/olmocr/bench/sample_data/pdfs/test-graphical-text.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/test-graphical-text.pdf


--------------------------------------------------------------------------------
/olmocr/bench/scripts/run_difference.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from openai import OpenAI
 4 | from runners.run_chatgpt import run_chatgpt
 5 | from runners.run_gemini import run_gemini
 6 | 
 7 | from olmocr.data.renderpdf import render_pdf_to_base64png
 8 | 
 9 | 
10 | def build_find_difference_prompt(base_text: str) -> str:
11 |     return (
12 |         f"Below is an image of a document page, along with raw textual content previously extracted using different models."
13 |         f"Your goal is to carefully identify the differences between the extracted texts from both models and determine which one is more accurate by comparing them with the image."
14 |         f"Only return the differences and specify which model extracted the text with higher accuracy.\n"
15 |         f"Do not hallucinate.\n"
16 |         f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
17 |     )
18 | 
19 | 
20 | def combined_output(pdf_path: str) -> str:
21 |     chatgpt_output = run_chatgpt(pdf_path)
22 |     gemini_output = run_gemini(pdf_path)
23 |     return f"ChatGPT OUTPUT: \n" f"{chatgpt_output}\n\n" f"Gemini OUTPUT: \n" f"{gemini_output}"
24 | 
25 | 
26 | def run_difference(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-06", temperature: float = 0.1) -> str:
27 |     """
28 |     Convert page of a PDF file to markdown using GPT.
29 | 
30 |     This function renders the first page of the PDF to an image, runs OCR on that image,
31 |     and returns the OCR result as a markdown-formatted string.
32 | 
33 |     Args:
34 |         pdf_path (str): The local path to the PDF file.
35 |         page_num (int): Which page from document to pass.
36 |         model (str): Model used to process.
37 |         Temperature (float): Temperature used while utilizing the model.
38 | 
39 |     Returns:
40 |         str: The result in markdown format.
41 |     """
42 |     # Convert the first page of the PDF to a base64-encoded PNG image.
43 |     image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
44 |     anchor_text = combined_output(pdf_path)
45 |     client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
46 | 
47 |     response = client.chat.completions.create(
48 |         model=model,
49 |         messages=[
50 |             {
51 |                 "role": "user",
52 |                 "content": [
53 |                     {"type": "text", "text": build_find_difference_prompt(anchor_text)},
54 |                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
55 |                 ],
56 |             }
57 |         ],
58 |         temperature=temperature,
59 |         max_tokens=3000,
60 |     )
61 | 
62 |     raw_response = response.choices[0].message.content
63 | 
64 |     return raw_response
65 | 


--------------------------------------------------------------------------------
/olmocr/bench/synth/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/synth/__init__.py


--------------------------------------------------------------------------------
/olmocr/bench/templates/all_done.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | <head>
 4 |     <meta charset="UTF-8">
 5 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 6 |     <title>All Tests Reviewed</title>
 7 |     <style>
 8 |         body {
 9 |             font-family: Arial, sans-serif;
10 |             margin: 0;
11 |             padding: 20px;
12 |             background-color: #f5f5f5;
13 |             display: flex;
14 |             justify-content: center;
15 |             align-items: center;
16 |             height: 100vh;
17 |             text-align: center;
18 |         }
19 |         
20 |         .message {
21 |             background-color: white;
22 |             padding: 40px;
23 |             border-radius: 8px;
24 |             box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
25 |         }
26 |         
27 |         h1 {
28 |             color: #28a745;
29 |         }
30 |     </style>
31 | </head>
32 | <body>
33 |     <div class="message">
34 |         <h1>All Tests Reviewed!</h1>
35 |         <p>You have completed reviewing all tests in the dataset.</p>
36 |     </div>
37 | </body>
38 | </html>


--------------------------------------------------------------------------------
/olmocr/bench/templates/all_done_latex.html:
--------------------------------------------------------------------------------
 1 | 
 2 | <!DOCTYPE html>
 3 | <html lang="en">
 4 | <head>
 5 |     <meta charset="UTF-8">
 6 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 7 |     <title>All Done!</title>
 8 |     <style>
 9 |         body {
10 |             font-family: Arial, sans-serif;
11 |             display: flex;
12 |             justify-content: center;
13 |             align-items: center;
14 |             height: 100vh;
15 |             margin: 0;
16 |             background-color: #f5f5f5;
17 |         }
18 |         .container {
19 |             text-align: center;
20 |             padding: 30px;
21 |             background-color: white;
22 |             border-radius: 10px;
23 |             box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
24 |         }
25 |         h1 {
26 |             color: #28a745;
27 |             margin-bottom: 20px;
28 |         }
29 |         p {
30 |             font-size: 18px;
31 |             margin-bottom: 20px;
32 |         }
33 |         button {
34 |             padding: 10px 20px;
35 |             background-color: #007bff;
36 |             color: white;
37 |             border: none;
38 |             border-radius: 5px;
39 |             cursor: pointer;
40 |             font-size: 16px;
41 |         }
42 |     </style>
43 | </head>
44 | <body>
45 |     <div class="container">
46 |         <h1>All Done! 🎉</h1>
47 |         <p>You have reviewed all equations in the dataset.</p>
48 |         <form method="post" action="/next_pdf">
49 |             <button type="submit">Start Over</button>
50 |         </form>
51 |     </div>
52 | </body>
53 | </html>
54 |     


--------------------------------------------------------------------------------
/olmocr/check.py:
--------------------------------------------------------------------------------
 1 | import importlib.util
 2 | import logging
 3 | import subprocess
 4 | import sys
 5 | 
 6 | logger = logging.getLogger(__name__)
 7 | 
 8 | 
 9 | def check_poppler_version():
10 |     try:
11 |         result = subprocess.run(["pdftoppm", "-h"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
12 |         if result.returncode == 0 and result.stderr.startswith("pdftoppm"):
13 |             logger.info("pdftoppm is installed and working.")
14 |         else:
15 |             logger.error("pdftoppm is installed but returned an error.")
16 |             sys.exit(1)
17 |     except FileNotFoundError:
18 |         logger.error("pdftoppm is not installed.")
19 |         logger.error("Check the README in the https://github.com/allenai/olmocr/blob/main/README.md for installation instructions")
20 |         sys.exit(1)
21 | 
22 | 
23 | def check_sglang_version():
24 |     if importlib.util.find_spec("sglang") is None:
25 |         logger.error("Please make sure sglang is installed according to the latest instructions here: https://docs.sglang.ai/start/install.html")
26 |         logger.error("Sglang needs to be installed with a separate command in order to find all dependencies properly.")
27 |         sys.exit(1)
28 | 
29 | 
30 | def check_torch_gpu_available(min_gpu_memory: int = 20 * 1024**3):
31 |     try:
32 |         import torch
33 |     except:
34 |         logger.error("Pytorch must be installed, visit https://pytorch.org/ for installation instructions")
35 |         raise
36 | 
37 |     try:
38 |         gpu_memory = torch.cuda.get_device_properties(0).total_memory
39 |         assert gpu_memory >= min_gpu_memory
40 |     except:
41 |         logger.error(f"Torch was not able to find a GPU with at least {min_gpu_memory // (1024 ** 3)} GB of RAM.")
42 |         raise
43 | 
44 | 
45 | if __name__ == "__main__":
46 |     check_poppler_version()
47 |     check_sglang_version()
48 | 


--------------------------------------------------------------------------------
/olmocr/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/data/__init__.py


--------------------------------------------------------------------------------
/olmocr/datatypes.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | import hashlib
 3 | import json
 4 | from dataclasses import dataclass
 5 | 
 6 | 
 7 | @dataclass(frozen=True)
 8 | class PdfOutput:
 9 |     path: str
10 |     text: str
11 |     total_pdf_pages: int
12 |     processed_pdf_pages: int
13 | 
14 |     def mk_dolma_doc(self, **kwargs) -> str:
15 |         metadata = {
16 |             "Source-File": self.path,
17 |             "pdf-pages": self.processed_pdf_pages,
18 |             "pdf-total-pages": self.total_pdf_pages,
19 |             # Kwargs are added as extra metadata
20 |             **kwargs,
21 |         }
22 |         id_ = hashlib.sha1(self.text.encode()).hexdigest()
23 | 
24 |         dolma_doc = {
25 |             "id": id_,
26 |             "text": self.text,
27 |             "source": "s2pdf",
28 |             "added": datetime.datetime.now().strftime("%Y-%m-%d"),
29 |             "created": datetime.datetime.now().strftime("%Y-%m-%d"),
30 |             "metadata": metadata,
31 |         }
32 | 
33 |         return json.dumps(dolma_doc)
34 | 


--------------------------------------------------------------------------------
/olmocr/eval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/eval/__init__.py


--------------------------------------------------------------------------------
/olmocr/eval/dolma_refine/aligners.py:
--------------------------------------------------------------------------------
 1 | from typing import Type
 2 | 
 3 | from sequence_align.pairwise import hirschberg, needleman_wunsch
 4 | 
 5 | from .registry import BaseRegistry
 6 | 
 7 | 
 8 | class AlignerRegistry(BaseRegistry[Type["BaseAligner"]]):
 9 |     """A registry for aligners."""
10 | 
11 | 
12 | class BaseAligner:
13 |     def __init__(self, *args, **kwargs):
14 |         super().__init__()
15 | 
16 |     def align(self, gold: list[str], pred: list[str]) -> tuple[list[str], list[str]]:
17 |         raise NotImplementedError()
18 | 
19 | 
20 | @AlignerRegistry.add("hirschberg")
21 | class HirschbergAligner(BaseAligner):
22 |     def __init__(
23 |         self,
24 |         match_score: float = 1.0,
25 |         mismatch_score: float = -1.0,
26 |         indel_score: float = -1.0,
27 |         gap_token: str = "▓",
28 |     ):
29 |         self.match_score = match_score
30 |         self.mismatch_score = mismatch_score
31 |         self.indel_score = indel_score
32 |         self.gap_token = gap_token
33 |         super().__init__()
34 | 
35 |     def align(self, gold: list[str], pred: list[str]) -> tuple[list[str], list[str]]:
36 |         return hirschberg(
37 |             gold,
38 |             pred,
39 |             match_score=self.match_score,
40 |             mismatch_score=self.mismatch_score,
41 |             indel_score=self.indel_score,
42 |             gap=self.gap_token,
43 |         )
44 | 
45 | 
46 | @AlignerRegistry.add("needleman-wunsch")
47 | class NeedlemanWunschAligner(BaseAligner):
48 |     def __init__(
49 |         self,
50 |         match_score: float = 1.0,
51 |         mismatch_score: float = -1.0,
52 |         indel_score: float = -1.0,
53 |         gap_token: str = "▓",
54 |     ):
55 |         self.match_score = match_score
56 |         self.mismatch_score = mismatch_score
57 |         self.indel_score = indel_score
58 |         self.gap_token = gap_token
59 |         super().__init__()
60 | 
61 |     def align(self, gold: list[str], pred: list[str]) -> tuple[list[str], list[str]]:
62 |         return needleman_wunsch(
63 |             gold,
64 |             pred,
65 |             match_score=self.match_score,
66 |             mismatch_score=self.mismatch_score,
67 |             indel_score=self.indel_score,
68 |             gap=self.gap_token,
69 |         )
70 | 


--------------------------------------------------------------------------------
/olmocr/eval/dolma_refine/segmenters.py:
--------------------------------------------------------------------------------
 1 | from typing import Type
 2 | 
 3 | from spacy.lang.en import English
 4 | 
 5 | from .registry import BaseRegistry
 6 | 
 7 | 
 8 | class SegmenterRegistry(BaseRegistry[Type["BaseSegmenter"]]):
 9 |     """A registry for segmenters."""
10 | 
11 | 
12 | class BaseSegmenter:
13 |     def __init__(self, segmenter_name_or_path: str, *args, **kwargs):
14 |         super().__init__()
15 | 
16 |     def segment(self, text: str) -> list[str]:
17 |         raise NotImplementedError()
18 | 
19 | 
20 | @SegmenterRegistry.add("spacy")
21 | class SpacySegmenter(BaseSegmenter):
22 |     def __init__(self, segmenter_name_or_path: str, *args, **kwargs):
23 |         assert segmenter_name_or_path == "spacy", "Only 'spacy' segmenter is supported"
24 |         self.nlp = English()
25 |         self.nlp.add_pipe("sentencizer")
26 | 
27 |     def segment(self, text: str) -> list[str]:
28 |         return [sent.text_with_ws for sent in self.nlp(text).sents]
29 | 


--------------------------------------------------------------------------------
/olmocr/filter/__init__.py:
--------------------------------------------------------------------------------
1 | from .filter import PdfFilter
2 | 


--------------------------------------------------------------------------------
/olmocr/filter/coherency.py:
--------------------------------------------------------------------------------
 1 | from functools import lru_cache
 2 | 
 3 | import torch
 4 | from transformers import AutoModelForCausalLM, AutoTokenizer
 5 | 
 6 | 
 7 | @lru_cache()
 8 | def load_coherency_model(model_name: str = "HuggingFaceTB/SmolLM-135M"):
 9 |     tokenizer = AutoTokenizer.from_pretrained(model_name)
10 |     model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
11 |     model.eval()  # Set the model to evaluation mode
12 | 
13 |     return tokenizer, model
14 | 
15 | 
16 | def get_document_coherency(text: str) -> float:
17 |     """
18 |     Calculates the coherency of a document based on the log likelihood of its tokens.
19 |     Handles texts longer than the model's maximum token limit by splitting them into chunks.
20 | 
21 |     Args:
22 |         text (str): The input text to evaluate.
23 | 
24 |     Returns:
25 |         float: The average log likelihood per token as a measure of coherency.
26 |     """
27 |     tokenizer, model = load_coherency_model()
28 | 
29 |     # Determine the model's maximum number of tokens
30 |     max_length = tokenizer.model_max_length - 1
31 |     # Some tokenizers have a default value indicating no limit; use model config if so
32 |     if max_length > 1_000_000:
33 |         max_length = model.config.max_position_embeddings
34 | 
35 |     # Tokenize the entire text
36 |     tokens = tokenizer.encode(text, return_tensors="pt").squeeze(0)
37 | 
38 |     total_log_likelihood = 0.0
39 |     total_tokens = 0
40 | 
41 |     # Split tokens into chunks that fit within the model's max length
42 |     for i in range(0, len(tokens), max_length):
43 |         chunk = tokens[i : i + max_length]
44 |         inputs = chunk.unsqueeze(0)  # Add batch dimension
45 | 
46 |         # Move inputs to CPU (ensure compatibility)
47 |         inputs = {k: v.cpu() for k, v in {"input_ids": inputs}.items()}
48 | 
49 |         with torch.no_grad():
50 |             outputs = model(**inputs, labels=inputs["input_ids"])
51 |             # Compute log likelihood for the chunk
52 |             log_likelihood = -outputs.loss.item() * chunk.size(0)
53 |             total_log_likelihood += log_likelihood
54 |             total_tokens += chunk.size(0)
55 | 
56 |     # Calculate the average log likelihood per token
57 |     avg_log_likelihood = total_log_likelihood / total_tokens if total_tokens > 0 else 0.0
58 | 
59 |     return avg_log_likelihood
60 | 


--------------------------------------------------------------------------------
/olmocr/image_utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import subprocess
 3 | from typing import List, Union
 4 | 
 5 | 
 6 | def convert_image_to_pdf_bytes(image_files: Union[str, List[str]]) -> bytes:
 7 |     """
 8 |     Convert one or multiple image files to PDF bytes.
 9 | 
10 |     Args:
11 |         image_files: A single image file path (str) or a list of image file paths
12 | 
13 |     Returns:
14 |         bytes: The PDF content as bytes
15 | 
16 |     Raises:
17 |         RuntimeError: If the conversion fails
18 |         ValueError: If invalid input is provided
19 |     """
20 |     # Handle different input types
21 |     if isinstance(image_files, str):
22 |         # Single image case
23 |         image_files = [image_files]
24 |     elif not isinstance(image_files, list) or not image_files:
25 |         raise ValueError("image_files must be a non-empty string or list of strings")
26 | 
27 |     # Validate files exist and are valid image formats
28 |     for image_file in image_files:
29 |         if not os.path.exists(image_file):
30 |             raise ValueError(f"File does not exist: {image_file}")
31 | 
32 |     try:
33 |         # Run img2pdf with all images as arguments
34 |         result = subprocess.run(["img2pdf"] + image_files, check=True, capture_output=True)
35 | 
36 |         # Return the stdout content which contains the PDF data
37 |         return result.stdout
38 | 
39 |     except subprocess.CalledProcessError as e:
40 |         # Raise error with stderr information if the conversion fails
41 |         raise RuntimeError(f"Error converting image(s) to PDF: {e.stderr.decode('utf-8')}")
42 | 
43 | 
44 | def is_png(file_path):
45 |     try:
46 |         with open(file_path, "rb") as f:
47 |             header = f.read(8)
48 |             return header == b"\x89PNG\r\n\x1a\n"
49 |     except Exception as e:
50 |         print(f"Error: {e}")
51 |         return False
52 | 
53 | 
54 | def is_jpeg(file_path):
55 |     try:
56 |         with open(file_path, "rb") as f:
57 |             header = f.read(2)
58 |             return header == b"\xff\xd8"
59 |     except Exception as e:
60 |         print(f"Error: {e}")
61 |         return False
62 | 


--------------------------------------------------------------------------------
/olmocr/loadertest.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from concurrent.futures import ProcessPoolExecutor, as_completed
 3 | 
 4 | import boto3
 5 | from tqdm import tqdm
 6 | 
 7 | # Configuration
 8 | BUCKET = "ai2-llm"
 9 | PREFIX = "pretraining-data/sources/soldni-open-access-books/v0/pipeline/results"
10 | OUTPUT_FILENAME = "all_completed_files.txt"
11 | 
12 | 
13 | def process_file(key: str):
14 |     """
15 |     Process a single S3 file given by its key.
16 |     Reads a jsonl file from S3, decodes each line,
17 |     extracts the 'Source-File' from the 'metadata' field,
18 |     and returns a list of these source file strings.
19 |     """
20 |     # Create a new S3 client in the worker thread (thread-safe)
21 |     s3 = boto3.client("s3")
22 |     extracted_lines = []
23 |     try:
24 |         response = s3.get_object(Bucket=BUCKET, Key=key)
25 |         for raw_line in response["Body"].iter_lines():
26 |             try:
27 |                 # Decode the line from bytes to text
28 |                 line_str = raw_line.decode("utf-8")
29 |             except UnicodeDecodeError as e:
30 |                 print(f"Skipping a line in {key} due to decode error: {e}")
31 |                 continue
32 |             try:
33 |                 data = json.loads(line_str)
34 |             except json.JSONDecodeError as e:
35 |                 print(f"Skipping a malformed json line in {key}: {e}")
36 |                 continue
37 |             # Extract 'Source-File' from metadata if present
38 |             metadata = data.get("metadata", {})
39 |             source_file = metadata.get("Source-File")
40 |             if source_file:
41 |                 extracted_lines.append(source_file)
42 |     except Exception as e:
43 |         print(f"Error processing file {key}: {e}")
44 |     return extracted_lines
45 | 
46 | 
47 | def main():
48 |     s3 = boto3.client("s3")
49 |     paginator = s3.get_paginator("list_objects_v2")
50 |     page_iterator = paginator.paginate(Bucket=BUCKET, Prefix=PREFIX)
51 | 
52 |     # Gather all S3 object keys under the specified prefix
53 |     keys = []
54 |     for page in page_iterator:
55 |         if "Contents" not in page:
56 |             continue
57 |         for obj in page["Contents"]:
58 |             keys.append(obj["Key"])
59 | 
60 |     print(f"Found {len(keys)} files to process.")
61 | 
62 |     # Open the output file for writing
63 |     with open(OUTPUT_FILENAME, "w", encoding="utf-8") as output_file:
64 |         # Create a thread pool to process files concurrently.
65 |         # Adjust max_workers based on your environment and workload.
66 |         with ProcessPoolExecutor() as executor:
67 |             # Submit all processing jobs and map each future to its key
68 |             future_to_key = {executor.submit(process_file, key): key for key in keys}
69 |             # Use tqdm to wrap the as_completed iterator for progress display
70 |             for future in tqdm(as_completed(future_to_key), total=len(future_to_key), desc="Processing files"):
71 |                 try:
72 |                     source_files = future.result()
73 |                     # Write each extracted line to the output file as soon as the future completes
74 |                     for source in source_files:
75 |                         output_file.write(source + "\n")
76 |                     # Optionally flush after each completed task
77 |                     output_file.flush()
78 |                 except Exception as e:
79 |                     key = future_to_key[future]
80 |                     print(f"Exception occurred for file {key}: {e}")
81 | 
82 |     print(f"Finished writing the source file names to {OUTPUT_FILENAME}")
83 | 
84 | 
85 | if __name__ == "__main__":
86 |     main()
87 | 


--------------------------------------------------------------------------------
/olmocr/prompts/__init__.py:
--------------------------------------------------------------------------------
1 | from .prompts import (
2 |     PageResponse,
3 |     build_finetuning_prompt,
4 |     build_openai_silver_data_prompt,
5 |     extract_raw_text,
6 |     openai_response_format_schema,
7 | )
8 | 


--------------------------------------------------------------------------------
/olmocr/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/py.typed


--------------------------------------------------------------------------------
/olmocr/train/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/train/__init__.py


--------------------------------------------------------------------------------
/olmocr/train/config/molmo-o-lora-8192.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   name_or_path: allenai/Molmo-7B-O-0924
 3 |   arch: causal
 4 |   use_flash_attn: true
 5 | 
 6 | wandb:
 7 |   project: pdelfin
 8 |   entity: ai2-llm
 9 | 
10 | generate:
11 |   max_length: 8192
12 | 
13 | train_data:
14 |   seed: 1337
15 |   cache_location: /data/jakep/pdfdata/pdelfin_cache
16 |   sources:
17 |     - name: openai_batch_data_v5_1_train
18 |       response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
19 |       target_longest_image_dim: [1024]
20 |       target_anchor_text_len: [6000]
21 |     - name: openai_batch_data_v5_1_iabooks_train
22 |       response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
23 |       target_longest_image_dim: [1024]
24 |       target_anchor_text_len: [6000]
25 | 
26 | valid_data:
27 |   cache_location: /data/jakep/pdfdata/pdelfin_cache
28 |   metric_for_best_model: openai_batch_data_v5_1_eval_loss
29 |   sources:
30 |     # These tend to be small, so you can load from s3 it's no big deal
31 |     - name: openai_batch_data_v5_1_eval
32 |       response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
33 |       target_longest_image_dim: [1024]
34 |       target_anchor_text_len: [6000]
35 |     - name: openai_batch_data_v5_1_iabooks_eval
36 |       response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_iabooks_eval/*.json
37 |       target_longest_image_dim: [1024]
38 |       target_anchor_text_len: [6000]
39 | 
40 | 
41 | # Mostly pulled from https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh
42 | hparams:
43 |   batch_size: 1
44 |   eval_batch_size: 1
45 |   gradient_accumulation_steps: 4
46 |   gradient_checkpointing: true
47 |   find_unused_parameters: true
48 |   clip_grad_norm: 1.0
49 |   learning_rate: 3e-4
50 |   max_steps: 10000
51 |   pad_multiple_of: 16
52 |   log_every_steps: 10
53 |   eval_every_steps: 100
54 |   optim: adamw_torch
55 |   lr_scheduler: cosine
56 |   weight_decay: 0.01
57 |   warmup_ratio: 0.03
58 | 
59 | # From https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py
60 | lora:
61 |   rank: 32
62 |   alpha: 32
63 |   dropout: 0.05
64 |   task_type: CAUSAL_LM
65 |   target_modules:
66 |       # attention layers in main transformer
67 |       - att_proj 
68 |       - ff_proj
69 |       - attn_out
70 |       - ff_out
71 |       # vision transformer attention and FF
72 |       - attention.wq
73 |       - attention.wk
74 |       - attention.wv
75 |       - attention.wo
76 |       - feed_forward.w1
77 |       - feed_forward.w2
78 |       # vision image projector
79 |       - vision_backbone.image_projector.w1
80 |       - vision_backbone.image_projector.w2
81 |       - vision_backbone.image_projector.w3
82 | 
83 | save:
84 |   path: s3://ai2-oe-data/jakep/experiments/molmo-o-0924/v1/models/
85 |   save_every_steps: 1000
86 | 
87 | max_workers: 10


--------------------------------------------------------------------------------
/olmocr/train/config/molmo-o-lora.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   name_or_path: allenai/Molmo-7B-O-0924
 3 |   arch: causal
 4 |   use_flash_attn: true
 5 | 
 6 | wandb:
 7 |   project: pdelfin
 8 |   entity: ai2-llm
 9 | 
10 | generate:
11 |   max_length: 4096
12 | 
13 | train_data:
14 |   seed: 1337
15 |   cache_location: /data/jakep/pdfdata/pdelfin_cache
16 |   sources:
17 |     - name: openai_batch_data_v5_1_train
18 |       response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
19 |       target_longest_image_dim: [1024]
20 |       target_anchor_text_len: [6000]
21 |     - name: openai_batch_data_v5_1_iabooks_train
22 |       response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
23 |       target_longest_image_dim: [1024]
24 |       target_anchor_text_len: [6000]
25 | 
26 | valid_data:
27 |   cache_location: /data/jakep/pdfdata/pdelfin_cache
28 |   metric_for_best_model: openai_batch_data_v5_1_eval_loss
29 |   sources:
30 |     # These tend to be small, so you can load from s3 it's no big deal
31 |     - name: openai_batch_data_v5_1_eval
32 |       response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
33 |       target_longest_image_dim: [1024]
34 |       target_anchor_text_len: [6000]
35 |     - name: openai_batch_data_v5_1_eval
36 |       response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
37 |       target_longest_image_dim: [1024]
38 |       target_anchor_text_len: [6000]
39 | 
40 | 
41 | 
42 | 
43 | # Mostly pulled from https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh
44 | hparams:
45 |   batch_size: 1
46 |   eval_batch_size: 1
47 |   gradient_accumulation_steps: 4
48 |   gradient_checkpointing: true
49 |   find_unused_parameters: true
50 |   clip_grad_norm: 1.0
51 |   learning_rate: 1e-4
52 |   max_steps: 10000
53 |   pad_multiple_of: 16
54 |   log_every_steps: 10
55 |   eval_every_steps: 100
56 |   optim: adamw_torch
57 |   lr_scheduler: cosine
58 |   weight_decay: 0.01
59 |   warmup_ratio: 0.03
60 | 
61 | # From https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py
62 | lora:
63 |   rank: 32
64 |   alpha: 32
65 |   dropout: 0.05
66 |   task_type: CAUSAL_LM
67 |   target_modules:
68 |       # attention layers in main transformer
69 |       - att_proj 
70 |       - ff_proj
71 |       - attn_out
72 |       - ff_out
73 |       # vision transformer attention and FF
74 |       - attention.wq
75 |       - attention.wk
76 |       - attention.wv
77 |       - attention.wo
78 |       - feed_forward.w1
79 |       - feed_forward.w2
80 |       # vision image projector
81 |       - vision_backbone.image_projector.w1
82 |       - vision_backbone.image_projector.w2
83 |       - vision_backbone.image_projector.w3
84 | 
85 | save:
86 |   path: s3://ai2-oe-data/jakep/experiments/molmo-o-0924/v1/models/
87 |   save_every_steps: 1000
88 | 
89 | max_workers: 10


--------------------------------------------------------------------------------
/olmocr/train/config/qwen25vl-7b.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   name_or_path: Qwen/Qwen2.5-VL-7B-Instruct
 3 |   arch: causal
 4 |   use_flash_attn: true
 5 | 
 6 | wandb:
 7 |   project: pdelfin
 8 |   entity: ai2-llm
 9 | 
10 | generate:
11 |   max_length: 8192
12 | 
13 | train_data:
14 |   seed: 1337
15 |   cache_location: /data/jakep/pdfdata/pdelfin_cache
16 |   sources:
17 |       # These tend to be small, so you can load from s3 it's no big deal
18 |     - name: openai_batch_data_v5_1_eval
19 |       response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
20 |       target_longest_image_dim: [1024]
21 |       target_anchor_text_len: [6000]
22 |     - name: openai_batch_data_v5_1_iabooks_eval
23 |       response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_iabooks_eval/*.json
24 |       target_longest_image_dim: [1024]
25 |       target_anchor_text_len: [6000]
26 |     # - name: openai_batch_data_v5_1_train
27 |     #   response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
28 |     #   target_longest_image_dim: [1024]
29 |     #   target_anchor_text_len: [6000]
30 |     # - name: openai_batch_data_v5_1_iabooks_train
31 |     #   response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
32 |     #   target_longest_image_dim: [1024]
33 |     #   target_anchor_text_len: [6000]
34 | 
35 | valid_data:
36 |   cache_location: /data/jakep/pdfdata/pdelfin_cache
37 |   metric_for_best_model: openai_batch_data_v5_1_eval_loss
38 |   sources:
39 |     # These tend to be small, so you can load from s3 it's no big deal
40 |     - name: openai_batch_data_v5_1_eval
41 |       response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
42 |       target_longest_image_dim: [1024]
43 |       target_anchor_text_len: [6000]
44 |     - name: openai_batch_data_v5_1_iabooks_eval
45 |       response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_iabooks_eval/*.json
46 |       target_longest_image_dim: [1024]
47 |       target_anchor_text_len: [6000]
48 | 
49 | 
50 | 
51 | # Mostly pulled from https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh
52 | hparams:
53 |   batch_size: 1
54 |   eval_batch_size: 1
55 |   gradient_accumulation_steps: 4
56 |   gradient_checkpointing: true
57 |   clip_grad_norm: 1.0
58 |   learning_rate: 1e-6
59 |   max_steps: 10000
60 |   pad_multiple_of: 16
61 |   log_every_steps: 10
62 |   eval_every_steps: 100
63 |   optim: adamw_torch
64 |   lr_scheduler: cosine
65 |   weight_decay: 0.01
66 |   warmup_ratio: 0.03
67 | 
68 | 
69 | save:
70 |   path: s3://ai2-oe-data/jakep/experiments/qwen25vl-pdf/v1/models/
71 |   save_every_steps: 9500
72 | 
73 | max_workers: 10


--------------------------------------------------------------------------------
/olmocr/train/config/qwen2vl-2b-lora.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   name_or_path: Qwen/Qwen2-VL-2B-Instruct
 3 |   arch: causal
 4 |   use_flash_attn: true
 5 | 
 6 | wandb:
 7 |   project: pdelfin
 8 |   entity: ai2-llm
 9 | 
10 | # TODO This is not used
11 | format:
12 |   instruction_template: "Original:"
13 |   response_template: "Rewritten:"
14 |   # Template from here: https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py#L30
15 |   chat_template: |
16 |     {% for message in messages %}
17 |       {{'<|im_start|>' + message['role'] + '\n' + message['content']}}
18 |       {% if loop.last %}
19 |         {{ '<|im_end|>'}}
20 |       {% else %}
21 |         {{ '<|im_end|>\n' }}
22 |       {% endif %}
23 |     {% endfor %}
24 | 
25 | generate:
26 |   max_length: 4096
27 | 
28 | train_data:
29 |   seed: 1337
30 |   sources:
31 |     - name: openai_batch_data_v2
32 |       query_glob_path: s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl
33 |       response_glob_path: s3://ai2-oe-data/jakep/openai_batch_done_v2/*.json
34 |       backend:
35 |         - openai
36 |       size: 100_000
37 | 
38 | valid_data:
39 |   sources:
40 |     - name: openai_batch_data_eval_mini
41 |       query_glob_path: s3://ai2-oe-data/jakep/openai_batch_data_eval_mini/*.jsonl
42 |       response_glob_path: s3://ai2-oe-data/jakep/openai_batch_done_eval_mini/*.json
43 |       backend:
44 |         - openai
45 |       size: 100_000
46 | 
47 | # Mostly pulled from https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh
48 | hparams:
49 |   batch_size: 1
50 |   eval_batch_size: 1
51 |   gradient_accumulation_steps: 4
52 |   gradient_checkpointing: false
53 |   clip_grad_norm: 1.0
54 |   learning_rate: 3e-4
55 |   max_steps: 2000
56 |   pad_multiple_of: 16
57 |   log_every_steps: 50
58 |   eval_every_steps: 1000
59 |   optim: adamw_torch
60 |   lr_scheduler: cosine
61 |   weight_decay: 0.01
62 |   warmup_ratio: 0.03
63 | 
64 | # From https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py
65 | lora:
66 |   rank: 32
67 |   alpha: 32
68 |   dropout: 0.05
69 |   task_type: causal_lm
70 |   target_modules:
71 |       - q_proj
72 |       - k_proj
73 |       - v_proj
74 |       - o_proj
75 |       - gate_proj
76 |       - up_proj
77 |       - down_proj
78 |       - visual.blocks.[0-9]+.attn.qkv
79 |       - visual.blocks.[0-9]+.attn.proj
80 |       - visual.blocks.[0-9]+.mlp.fc1
81 |       - visual.blocks.[0-9]+.mlp.fc2
82 |       - visual.merger.mlp.0
83 |       - visual.merger.mlp.2
84 | 
85 | save:
86 |   path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/
87 |   save_every_steps: 1000
88 | 
89 | max_workers: 10


--------------------------------------------------------------------------------
/olmocr/train/config/qwen2vl-2b.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   name_or_path: Qwen/Qwen2-VL-2B-Instruct
 3 |   arch: causal
 4 |   use_flash_attn: true
 5 | 
 6 | wandb:
 7 |   project: pdelfin
 8 |   entity: ai2-llm
 9 | 
10 | # TODO This is not used
11 | format:
12 |   instruction_template: "Original:"
13 |   response_template: "Rewritten:"
14 |   # Template from here: https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py#L30
15 |   chat_template: |
16 |     {% for message in messages %}
17 |       {{'<|im_start|>' + message['role'] + '\n' + message['content']}}
18 |       {% if loop.last %}
19 |         {{ '<|im_end|>'}}
20 |       {% else %}
21 |         {{ '<|im_end|>\n' }}
22 |       {% endif %}
23 |     {% endfor %}
24 | 
25 | generate:
26 |   max_length: 4096
27 | 
28 | train_data:
29 |   seed: 1337
30 |   sources:
31 |     - name: openai_batch_data_v2
32 |       query_glob_path: s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl
33 |       response_glob_path: s3://ai2-oe-data/jakep/openai_batch_done_v2/*.json
34 |       backend:
35 |         - openai
36 |       size: 100_000
37 | 
38 | valid_data:
39 |   sources:
40 |     - name: openai_batch_data_eval_mini
41 |       query_glob_path: s3://ai2-oe-data/jakep/openai_batch_data_eval_mini/*.jsonl
42 |       response_glob_path: s3://ai2-oe-data/jakep/openai_batch_done_eval_mini/*.json
43 |       backend:
44 |         - openai
45 |       size: 100_000
46 | 
47 | # Mostly pulled from https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh
48 | hparams:
49 |   batch_size: 1
50 |   eval_batch_size: 1
51 |   gradient_accumulation_steps: 4
52 |   gradient_checkpointing: false
53 |   clip_grad_norm: 1.0
54 |   learning_rate: 3e-4
55 |   max_steps: 2000
56 |   pad_multiple_of: 16
57 |   log_every_steps: 50
58 |   eval_every_steps: 1000
59 |   optim: adamw_torch
60 |   lr_scheduler: cosine
61 |   weight_decay: 0.01
62 |   warmup_ratio: 0.03
63 | 
64 | # From https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py
65 | # Disable LORA for now, because we want the visual network to get trained too
66 | # lora:
67 | #   rank: 32
68 | #   alpha: 32
69 | #   dropout: 0.05
70 | #   task_type: causal_lm
71 | #   target_modules:
72 | #       - q_proj
73 | #       - k_proj
74 | #       - v_proj
75 | #       - o_proj
76 | #       - gate_proj
77 | #       - up_proj
78 | #       - down_proj
79 | 
80 | save:
81 |   path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/
82 |   save_every_steps: 1000
83 | 
84 | max_workers: 10


--------------------------------------------------------------------------------
/olmocr/train/config/qwen2vl-7b-lora.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   name_or_path: Qwen/Qwen2-VL-7B-Instruct
 3 |   arch: causal
 4 |   use_flash_attn: true
 5 | 
 6 | wandb:
 7 |   project: pdelfin
 8 |   entity: ai2-llm
 9 | 
10 | generate:
11 |   max_length: 8192
12 | 
13 | train_data:
14 |   seed: 1337
15 |   cache_location: /data/jakep/pdfdata/pdelfin_cache
16 |   sources:
17 |     - name: openai_batch_data_v5_1_train
18 |       response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
19 |       target_longest_image_dim: 1024
20 |       target_anchor_text_len: 6000
21 |     - name: openai_batch_data_v5_1_iabooks_train
22 |       response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
23 |       target_longest_image_dim: 1024
24 |       target_anchor_text_len: 6000
25 | 
26 | valid_data:
27 |   cache_location: /data/jakep/pdfdata/pdelfin_cache
28 |   metric_for_best_model: openai_batch_data_v5_1_eval_loss
29 |   sources:
30 |     # These tend to be small, so you can load from s3 it's no big deal
31 |     - name: openai_batch_data_v5_1_eval
32 |       response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
33 |       target_longest_image_dim: 1024
34 |       target_anchor_text_len: 6000
35 |     - name: openai_batch_data_v5_1_iabooks_eval
36 |       response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_iabooks_eval/*.json
37 |       target_longest_image_dim: 1024
38 |       target_anchor_text_len: 6000
39 | 
40 | 
41 | 
42 | # Mostly pulled from https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh
43 | hparams:
44 |   batch_size: 1
45 |   eval_batch_size: 1
46 |   gradient_accumulation_steps: 4
47 |   gradient_checkpointing: true
48 |   clip_grad_norm: 1.0
49 |   learning_rate: 1e-4
50 |   max_steps: 10000
51 |   pad_multiple_of: 16
52 |   log_every_steps: 10
53 |   eval_every_steps: 100
54 |   optim: adamw_torch
55 |   lr_scheduler: cosine
56 |   weight_decay: 0.01
57 |   warmup_ratio: 0.03
58 | 
59 | # From https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py
60 | lora:
61 |   rank: 32
62 |   alpha: 32
63 |   dropout: 0.05
64 |   task_type: causal_lm
65 |   target_modules:
66 |       - q_proj
67 |       - k_proj
68 |       - v_proj
69 |       - o_proj
70 |       - gate_proj
71 |       - up_proj
72 |       - down_proj
73 |       - visual.blocks.[0-9]+.attn.qkv
74 |       - visual.blocks.[0-9]+.attn.proj
75 |       - visual.blocks.[0-9]+.mlp.fc1
76 |       - visual.blocks.[0-9]+.mlp.fc2
77 |       - visual.merger.mlp.0
78 |       - visual.merger.mlp.2
79 | 
80 | save:
81 |   path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/
82 |   save_every_steps: 1000
83 | 
84 | max_workers: 10


--------------------------------------------------------------------------------
/olmocr/train/config/qwen2vl-7b.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   name_or_path: Qwen/Qwen2-VL-7B-Instruct
 3 |   arch: causal
 4 |   use_flash_attn: true
 5 | 
 6 | wandb:
 7 |   project: pdelfin
 8 |   entity: ai2-llm
 9 | 
10 | generate:
11 |   max_length: 8192
12 | 
13 | train_data:
14 |   seed: 1337
15 |   cache_location: /data/jakep/pdfdata/pdelfin_cache
16 |   sources:
17 |     - name: openai_batch_data_v5_1_train
18 |       response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
19 |       target_longest_image_dim: [1024]
20 |       target_anchor_text_len: [6000]
21 |     - name: openai_batch_data_v5_1_iabooks_train
22 |       response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
23 |       target_longest_image_dim: [1024]
24 |       target_anchor_text_len: [6000]
25 | 
26 | valid_data:
27 |   cache_location: /data/jakep/pdfdata/pdelfin_cache
28 |   metric_for_best_model: openai_batch_data_v5_1_eval_loss
29 |   sources:
30 |     # These tend to be small, so you can load from s3 it's no big deal
31 |     - name: openai_batch_data_v5_1_eval
32 |       response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
33 |       target_longest_image_dim: [1024]
34 |       target_anchor_text_len: [6000]
35 |     - name: openai_batch_data_v5_1_iabooks_eval
36 |       response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_iabooks_eval/*.json
37 |       target_longest_image_dim: [1024]
38 |       target_anchor_text_len: [6000]
39 | 
40 | 
41 | 
42 | # Mostly pulled from https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh
43 | hparams:
44 |   batch_size: 1
45 |   eval_batch_size: 1
46 |   gradient_accumulation_steps: 4
47 |   gradient_checkpointing: true
48 |   clip_grad_norm: 1.0
49 |   learning_rate: 1e-6
50 |   max_steps: 10000
51 |   pad_multiple_of: 16
52 |   log_every_steps: 10
53 |   eval_every_steps: 100
54 |   optim: adamw_torch
55 |   lr_scheduler: cosine
56 |   weight_decay: 0.01
57 |   warmup_ratio: 0.03
58 | 
59 | 
60 | save:
61 |   path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/
62 |   save_every_steps: 9500
63 | 
64 | max_workers: 10


--------------------------------------------------------------------------------
/olmocr/train/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/train/core/__init__.py


--------------------------------------------------------------------------------
/olmocr/train/core/adapters.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from logging import Logger
 3 | from typing import Optional, Type
 4 | 
 5 | import smart_open
 6 | import torch
 7 | from peft.peft_model import PeftModel
 8 | from transformers import (
 9 |     AutoModelForCausalLM,
10 |     AutoModelForSeq2SeqLM,
11 |     AutoModelWithLMHead,
12 |     AutoTokenizer,
13 | )
14 | 
15 | from .config import ModelConfig
16 | from .loggers import get_logger
17 | from .paths import cached_path, exists, get_cache_dir, join_path, resource_to_filename
18 | 
19 | __all__ = ["load_model", "cache_merged_model"]
20 | 
21 | 
22 | def get_model_cls(config: ModelConfig) -> Type[AutoModelWithLMHead]:
23 |     if config.arch == "seq2seq":
24 |         return AutoModelForSeq2SeqLM  # pyright: ignore
25 |     elif config.arch == "causal" or config.arch == "vllm":
26 |         return AutoModelForCausalLM  # pyright: ignore
27 |     else:
28 |         raise ValueError(f"Unsupported model architecture: {config.arch}")
29 | 
30 | 
31 | def get_adapter_config(config: ModelConfig) -> dict:
32 |     local_path = cached_path(config.name_or_path)
33 |     if exists(adapter_config_path := join_path("", local_path, "adapter_config.json")):
34 |         with smart_open.open(adapter_config_path, "rt", encoding="utf-8") as f:
35 |             return json.load(f)
36 |     return {}
37 | 
38 | 
39 | def load_model(config: ModelConfig, logger: Optional[Logger] = None) -> AutoModelWithLMHead:
40 |     logger = logger or get_logger(__file__, level="INFO")
41 | 
42 |     logger.info(f"Loading model from {config.name_or_path}")
43 |     local_path = cached_path(config.name_or_path)
44 |     if local_path != config.name_or_path:
45 |         logger.info(f"Model cached at {local_path}")
46 | 
47 |     if exists(adapter_config_path := join_path("", local_path, "adapter_config.json")):
48 |         logger.info(f"Loading LoRA adapter from {adapter_config_path}")
49 |         with smart_open.open(adapter_config_path) as f:
50 |             adapter_config = json.load(f)
51 |         base_model_name_or_path = adapter_config["base_model_name_or_path"]
52 |         enable_lora = True
53 |     else:
54 |         base_model_name_or_path = local_path
55 |         enable_lora = False
56 | 
57 |     model = get_model_cls(config).from_pretrained(
58 |         base_model_name_or_path,
59 |         device_map="auto",
60 |         trust_remote_code=config.trust_remote_code,
61 |         # low_cpu_mem_usage=model_config.low_cpu_mem_usage,
62 |         use_flash_attention_2=True if config.use_flash_attn else False,
63 |         revision=config.model_revision,
64 |         torch_dtype=torch.bfloat16 if config.use_flash_attn else getattr(torch, config.dtype),
65 |     )
66 |     logger.info(f"Successfully loaded base model from {base_model_name_or_path}")
67 | 
68 |     if enable_lora:
69 |         peft_model = PeftModel.from_pretrained(model, local_path)
70 |         model = peft_model.merge_and_unload()
71 |         logger.info(f"Successfully loaded LoRA adapter from base model: {base_model_name_or_path}")
72 | 
73 |     return model
74 | 
75 | 
76 | def cache_merged_model(config: ModelConfig, logger: Optional[Logger] = None) -> str:
77 |     logger = logger or get_logger(__file__, level="INFO")
78 | 
79 |     base_local_path = cached_path(config.name_or_path)
80 |     adapter_config = get_adapter_config(config)
81 |     if not adapter_config:
82 |         logger.info("No adapter config found; using base model")
83 |         return base_local_path
84 | 
85 |     local_fn = resource_to_filename(json.dumps({"adapter": adapter_config, "model": config.name_or_path}))
86 |     merged_local_path = f"{get_cache_dir()}/{local_fn}"
87 | 
88 |     if not exists(merged_local_path):
89 |         model = load_model(config=config, logger=logger)
90 |         tokenizer = AutoTokenizer.from_pretrained(base_local_path)
91 | 
92 |         logger.info(f"Saving merged model to {merged_local_path}")
93 |         model.save_pretrained(merged_local_path)
94 |         tokenizer.save_pretrained(merged_local_path)
95 | 
96 |     return merged_local_path
97 | 


--------------------------------------------------------------------------------
/olmocr/train/core/compression.py:
--------------------------------------------------------------------------------
 1 | from smart_open import register_compressor
 2 | 
 3 | __all__ = ["mk_compression"]
 4 | 
 5 | 
 6 | def mk_compression():
 7 |     def _handle_zst(file_obj, mode):
 8 |         try:
 9 |             import zstandard as zstd
10 |         except ImportError:
11 |             raise ImportError("zstandard is required for zstd support")
12 | 
13 |         return zstd.open(file_obj, mode)
14 | 
15 |     register_compressor(".zstd", _handle_zst)
16 |     register_compressor(".zst", _handle_zst)
17 | 


--------------------------------------------------------------------------------
/olmocr/train/core/errors.py:
--------------------------------------------------------------------------------
1 | class DolmaRefineError(RuntimeError): ...
2 | 


--------------------------------------------------------------------------------
/olmocr/train/core/loggers.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import multiprocessing
 3 | from typing import Union
 4 | 
 5 | LOGGER_PREFIX = "dolma-refine"
 6 | 
 7 | 
 8 | def get_logger(name: str, level: Union[int, str] = logging.WARN) -> logging.Logger:
 9 |     if (proc_name := multiprocessing.current_process().name) == "MainProcess":
10 |         proc_name = "main"
11 |     proc_name = proc_name.replace(" ", "_")
12 | 
13 |     # set the log level
14 |     level = level if isinstance(level, int) else getattr(logging, level.strip().upper(), logging.WARN)
15 | 
16 |     # set name
17 |     name = f"{LOGGER_PREFIX}.{proc_name}.{name}"
18 |     logger = logging.getLogger(name)
19 |     logger.setLevel(level)
20 | 
21 |     # add handler
22 |     if not logger.handlers:
23 |         handler = logging.StreamHandler()
24 |         formatter = logging.Formatter("[%(asctime)s %(name)s %(levelname)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
25 |         handler.setFormatter(formatter)
26 |         logger.addHandler(handler)
27 | 
28 |     return logger
29 | 
30 | 
31 | def reset_level(level: Union[int, str]) -> None:
32 |     """
33 |     Reset the log level for all Dolma loggers.
34 | 
35 |     Args:
36 |         level (Union[int, str]): The log level to set. It can be either an integer
37 |             representing the log level (e.g., logging.DEBUG) or a string
38 |             representing the log level name (e.g., 'debug').
39 | 
40 |     Returns:
41 |         None
42 |     """
43 |     if isinstance(level, str):
44 |         if (level_tmp := getattr(logging, level.strip().upper(), None)) is not None:
45 |             level = level_tmp
46 |         else:
47 |             raise ValueError(f"Invalid log level: {level}")
48 | 
49 |     for logger in logging.Logger.manager.loggerDict.values():
50 |         if isinstance(logger, logging.Logger):
51 |             if logger.name.startswith(LOGGER_PREFIX):
52 |                 logger.setLevel(level)
53 | 


--------------------------------------------------------------------------------
/olmocr/train/core/state.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from dataclasses import dataclass
 3 | from typing import Optional
 4 | 
 5 | 
 6 | @dataclass
 7 | class BeakerState:
 8 |     job_id: Optional[str] = None
 9 |     job_kind: Optional[str] = None
10 |     task_id: Optional[str] = None
11 |     experiment_id: Optional[str] = None
12 |     replica_rank: Optional[str] = None
13 |     leader_replica_hostname: Optional[str] = None
14 |     leader_replica_node_id: Optional[str] = None
15 |     user_id: Optional[str] = None
16 | 
17 |     def __post_init__(self):
18 |         for key, value in os.environ.items():
19 |             if not key.startswith("BEAKER_"):
20 |                 continue
21 |             setattr(self, key.lstrip("BEAKER_").lower(), value)
22 | 
23 |     @property
24 |     def url(self) -> Optional[str]:
25 |         if self.job_id:
26 |             return f"https://beaker.org/jobs/{self.job_id}"
27 |         return None
28 | 


--------------------------------------------------------------------------------
/olmocr/train/hf/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/train/hf/__init__.py


--------------------------------------------------------------------------------
/olmocr/train/hf/hfhub_upload.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | import tarfile
 4 | from math import ceil
 5 | 
 6 | from huggingface_hub import HfApi
 7 | 
 8 | # Configuration
 9 | pdf_dir = "pdfs"  # Directory with PDF files (flat structure)
10 | tarball_dir = "tarballs"  # Directory where tar.gz files will be saved
11 | os.makedirs(tarball_dir, exist_ok=True)
12 | repo_id = "allenai/olmOCR-mix-0225"  # Hugging Face dataset repo ID
13 | 
14 | # Set up logging to file
15 | logging.basicConfig(filename="upload.log", level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
16 | 
17 | 
18 | def process_chunk(args):
19 |     """
20 |     Worker function to create a tar.gz file for a given chunk.
21 |     Returns a tuple: (chunk_index, success (bool), message).
22 |     """
23 |     chunk_index, chunk_files = args
24 |     tarball_name = f"pdf_chunk_{chunk_index:04d}.tar.gz"
25 |     tarball_path = os.path.join(tarball_dir, tarball_name)
26 | 
27 |     try:
28 |         with tarfile.open(tarball_path, "w:gz") as tar:
29 |             for pdf_filename in chunk_files:
30 |                 pdf_path = os.path.join(pdf_dir, pdf_filename)
31 |                 # Add the file with its basename to maintain a flat structure
32 |                 tar.add(pdf_path, arcname=pdf_filename)
33 |         logging.info(f"Chunk {chunk_index:04d}: Created '{tarball_name}' with {len(chunk_files)} PDFs.")
34 |         return chunk_index, True, "Success"
35 |     except Exception as e:
36 |         error_msg = f"Chunk {chunk_index:04d}: Error creating '{tarball_name}': {e}"
37 |         logging.error(error_msg)
38 |         return chunk_index, False, error_msg
39 | 
40 | 
41 | def main():
42 |     # List all PDF files (assuming a flat directory)
43 |     try:
44 |         pdf_files = sorted([f for f in os.listdir(pdf_dir) if f.lower().endswith(".pdf")])
45 |     except Exception as e:
46 |         logging.error(f"Error listing PDFs in '{pdf_dir}': {e}")
47 |         return
48 | 
49 |     total_files = len(pdf_files)
50 |     chunk_size = 5000
51 |     total_chunks = ceil(total_files / chunk_size)
52 |     logging.info(f"Found {total_files} PDFs; dividing into {total_chunks} chunks of up to {chunk_size} files each.")
53 | 
54 |     # # Enumerate chunks (starting at 0000)
55 |     # chunks = []
56 |     # for idx in range(total_chunks):
57 |     #     start = idx * chunk_size
58 |     #     end = start + chunk_size
59 |     #     chunk_files = pdf_files[start:end]
60 |     #     chunks.append((idx, chunk_files))
61 | 
62 |     # # Create tarballs in parallel
63 |     # results = []
64 |     # with ProcessPoolExecutor() as executor:
65 |     #     futures = {executor.submit(process_chunk, chunk): chunk for chunk in chunks}
66 |     #     for future in tqdm(as_completed(futures), total=len(futures), desc="Creating tarballs"):
67 |     #         try:
68 |     #             result = future.result()
69 |     #             results.append(result)
70 |     #             chunk_index, success, message = result
71 |     #             if not success:
72 |     #                 logging.error(f"Chunk {chunk_index:04d} failed: {message}")
73 |     #         except Exception as e:
74 |     #             logging.error(f"Unexpected error processing a chunk: {e}")
75 | 
76 |     # # Abort upload if any tarball creation failed
77 |     # failed_chunks = [r for r in results if not r[1]]
78 |     # if failed_chunks:
79 |     #     logging.error(f"{len(failed_chunks)} chunk(s) failed to create. Aborting upload.")
80 |     #     return
81 | 
82 |     # All tarballs created successfully; now upload the entire tarball directory
83 | 
84 |     api = HfApi()
85 |     logging.info("Starting upload of tarballs folder to Hugging Face Hub...")
86 |     # This will upload all files in tarball_dir to the repo under "pdf_tarballs"
87 |     api.upload_large_folder(
88 |         folder_path=tarball_dir,
89 |         repo_id=repo_id,
90 |         # path_in_repo="pdf_tarballs",
91 |         repo_type="dataset",
92 |     )
93 |     logging.info("Successfully uploaded tarballs folder to Hugging Face Hub.")
94 | 
95 | 
96 | if __name__ == "__main__":
97 |     main()
98 | 


--------------------------------------------------------------------------------
/olmocr/train/inference.py:
--------------------------------------------------------------------------------
 1 | import base64
 2 | from io import BytesIO
 3 | 
 4 | import torch
 5 | import torch.distributed
 6 | from PIL import Image
 7 | from transformers import AutoConfig, AutoProcessor, Qwen2_5_VLForConditionalGeneration
 8 | 
 9 | from olmocr.data.renderpdf import render_pdf_to_base64png
10 | from olmocr.prompts.anchor import get_anchor_text
11 | from olmocr.prompts.prompts import build_openai_silver_data_prompt
12 | 
13 | 
14 | @torch.no_grad()
15 | def run_inference(model_name: str):
16 |     config = AutoConfig.from_pretrained(model_name)
17 |     processor = AutoProcessor.from_pretrained(model_name)
18 | 
19 |     # If it doesn't load, change the type:mrope key to "default"
20 | 
21 |     # model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, device_map="auto", config=config)
22 |     model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_name, device_map="auto", config=config)
23 |     model.eval()
24 | 
25 |     # local_pdf_path = os.path.join(os.path.dirname(__file__), "..", "..", "tests", "gnarly_pdfs", "horribleocr.pdf")
26 |     local_pdf_path = "/root/brochure.pdf"
27 |     page = 1
28 | 
29 |     image_base64 = render_pdf_to_base64png(local_pdf_path, page, 1024)
30 |     anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
31 | 
32 |     messages = [
33 |         {
34 |             "role": "user",
35 |             "content": [
36 |                 {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)},
37 |                 {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
38 |             ],
39 |         }
40 |     ]
41 | 
42 |     # Preparation for inference
43 |     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
44 | 
45 |     main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
46 | 
47 |     inputs = processor(
48 |         text=[text],
49 |         images=[main_image],
50 |         padding=True,
51 |         return_tensors="pt",
52 |     )
53 |     inputs = inputs.to("cuda")
54 | 
55 |     output_ids = model.generate(**inputs, temperature=0.8, do_sample=True, max_new_tokens=1500)
56 |     generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], output_ids)]
57 |     output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
58 |     print(output_text[0])
59 | 
60 | 
61 | def main():
62 |     run_inference(model_name="Qwen/Qwen2.5-VL-7B-Instruct")
63 | 
64 | 
65 | if __name__ == "__main__":
66 |     main()
67 | 


--------------------------------------------------------------------------------
/olmocr/train/loaddataset.py:
--------------------------------------------------------------------------------
 1 | from transformers import AutoProcessor
 2 | 
 3 | from olmocr.train.core.cli import make_cli
 4 | from olmocr.train.core.config import TrainConfig
 5 | 
 6 | from .utils import make_dataset
 7 | 
 8 | 
 9 | def main():
10 |     train_config = make_cli(TrainConfig)  # pyright: ignore
11 | 
12 |     processor = AutoProcessor.from_pretrained(train_config.model.name_or_path, trust_remote_code=True)
13 |     train_dataset, valid_dataset = make_dataset(train_config, processor)
14 | 
15 |     print("Training dataset........")
16 |     print(train_dataset)
17 | 
18 |     train_example = train_dataset[0]
19 |     print(train_example)
20 |     print({(x, y.shape) for x, y in train_example.items()})
21 |     print("\nTokens")
22 |     print(processor.tokenizer.batch_decode(train_example["input_ids"]))
23 | 
24 |     print("\n\n")
25 | 
26 |     print("Validation dataset........")
27 |     print(valid_dataset)
28 |     print(valid_dataset[list(valid_dataset.keys())[0]][0])
29 |     print("\n\n")
30 | 
31 |     print("Datasets loaded into hugging face cache directory")
32 | 
33 |     # data_collator = TruncatingCollator(
34 |     #     max_length=4096
35 |     # )
36 | 
37 |     # train_dataloader = DataLoader(train_dataset, batch_size=1, num_workers=4, shuffle=False, collate_fn=data_collator)
38 |     # max_seen_len = 0
39 |     # for index, entry in tqdm(enumerate(train_dataloader)):
40 |     #     if index == 0:
41 |     #         print(entry)
42 | 
43 |     #     num_input_tokens = entry["input_ids"].shape[1]
44 |     #     max_seen_len = max(max_seen_len, num_input_tokens)
45 | 
46 |     #     print(max_seen_len)
47 | 
48 | 
49 | if __name__ == "__main__":
50 |     main()
51 | 


--------------------------------------------------------------------------------
/olmocr/train/molmo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/train/molmo/__init__.py


--------------------------------------------------------------------------------
/olmocr/train/molmo/config_molmo.py:
--------------------------------------------------------------------------------
 1 | from transformers import PretrainedConfig
 2 | 
 3 | 
 4 | class MolmoConfig(PretrainedConfig):
 5 |     model_type = "molmo"
 6 |     keys_to_ignore_at_inference = ["past_key_values"]
 7 | 
 8 |     def __init__(
 9 |         self,
10 |         vocab_size=50304,
11 |         embedding_size=50304,
12 |         hidden_size=4096,
13 |         intermediate_size=11008,
14 |         num_hidden_layers=32,
15 |         num_attention_heads=32,
16 |         num_key_value_heads=None,
17 |         max_position_embeddings=2048,
18 |         initializer_range=0.02,
19 |         use_cache=True,
20 |         layer_norm_eps: float = 1e-5,
21 |         rope_theta=10000.0,
22 |         clip_qkv=None,
23 |         qkv_bias: bool = False,
24 |         weight_tying: bool = False,
25 |         use_position_ids: bool = True,
26 |         tie_word_embeddings: bool = True,
27 |         attention_layer_norm: bool = False,
28 |         norm_after: bool = False,
29 |         layer_norm_type: str = "rms",
30 |         **kwargs,
31 |     ):
32 |         self.vocab_size = vocab_size
33 |         self.embedding_size = embedding_size
34 |         self.max_position_embeddings = max_position_embeddings
35 |         self.hidden_size = hidden_size
36 |         self.intermediate_size = intermediate_size
37 |         self.num_hidden_layers = num_hidden_layers
38 |         self.num_attention_heads = num_attention_heads
39 |         self.layer_norm_eps = layer_norm_eps
40 |         self.weight_tying = weight_tying
41 |         self.use_position_ids = use_position_ids
42 |         self.attention_layer_norm = attention_layer_norm
43 |         self.num_key_value_heads = num_key_value_heads
44 |         self.initializer_range = initializer_range
45 |         self.use_cache = use_cache
46 |         self.rope_theta = rope_theta
47 |         self.clip_qkv = clip_qkv
48 |         self.qkv_bias = qkv_bias
49 |         self.norm_after = norm_after
50 |         self.tie_word_embeddings = tie_word_embeddings
51 |         self.layer_norm_type = layer_norm_type
52 | 
53 |         super().__init__(
54 |             tie_word_embeddings=tie_word_embeddings,
55 |             **kwargs,
56 |         )
57 | 
58 | 
59 | MolmoConfig.register_for_auto_class()
60 | 


--------------------------------------------------------------------------------
/olmocr/version.py:
--------------------------------------------------------------------------------
 1 | _MAJOR = "0"
 2 | _MINOR = "1"
 3 | # On main and in a nightly release the patch should be one ahead of the last
 4 | # released build.
 5 | _PATCH = "71"
 6 | # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 7 | # https://semver.org/#is-v123-a-semantic-version for the semantics.
 8 | _SUFFIX = ""
 9 | 
10 | VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
11 | VERSION = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX)
12 | 


--------------------------------------------------------------------------------
/olmocr/viewer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/viewer/__init__.py


--------------------------------------------------------------------------------
/scripts/beaker/Dockerfile-gpu-ci:
--------------------------------------------------------------------------------
 1 | FROM --platform=linux/amd64 nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04
 2 | 
 3 | RUN apt-get update -y && apt-get install -y software-properties-common \
 4 |     && add-apt-repository ppa:deadsnakes/ppa \
 5 |     && apt-get -y update
 6 | 
 7 | # Install requirements specific to pdfs
 8 | RUN apt-get update && apt-get -y install python3-apt
 9 | RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections
10 | RUN apt-get update -y && apt-get install -y poppler-utils ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools
11 | 
12 | RUN apt-get update -y && apt-get install -y --no-install-recommends \
13 |     git \
14 |     python3.11 \
15 |     python3.11-dev \
16 |     python3.11-distutils \
17 |     ca-certificates \
18 |     build-essential \
19 |     curl \
20 |     unzip
21 | 
22 | RUN rm -rf /var/lib/apt/lists/* \
23 |     && unlink /usr/bin/python3 \
24 |     && ln -s /usr/bin/python3.11 /usr/bin/python3 \
25 |     && ln -s /usr/bin/python3 /usr/bin/python \
26 |     && curl -sS https://bootstrap.pypa.io/get-pip.py | python \
27 |     && pip3 install -U pip
28 | 
29 | RUN apt-get update && apt-get -y install python3.11-venv
30 | ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
31 | RUN /install.sh && rm /install.sh
32 | 
33 | 
34 | WORKDIR /root
35 | COPY gpu-ci-script.sh .
36 | 
37 | ENV PYTHONUNBUFFERED=1
38 | 


--------------------------------------------------------------------------------
/scripts/beaker/Dockerfile-inference:
--------------------------------------------------------------------------------
 1 | FROM --platform=linux/amd64 nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04
 2 | 
 3 | RUN apt-get update -y && apt-get install -y software-properties-common \
 4 |     && add-apt-repository ppa:deadsnakes/ppa \
 5 |     && apt-get -y update
 6 | 
 7 | # Install requirements specific to pdfs
 8 | RUN apt-get update && apt-get -y install python3-apt
 9 | RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections
10 | RUN apt-get update -y && apt-get install -y poppler-utils ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools
11 | 
12 | RUN apt-get update -y && apt-get install -y --no-install-recommends \
13 |     git \
14 |     python3.11 \
15 |     python3.11-dev \
16 |     python3.11-distutils \
17 |     ca-certificates \
18 |     build-essential \
19 |     curl \
20 |     unzip
21 | 
22 | RUN rm -rf /var/lib/apt/lists/* \
23 |     && unlink /usr/bin/python3 \
24 |     && ln -s /usr/bin/python3.11 /usr/bin/python3 \
25 |     && ln -s /usr/bin/python3 /usr/bin/python \
26 |     && curl -sS https://bootstrap.pypa.io/get-pip.py | python \
27 |     && pip3 install -U pip    
28 | 
29 | RUN apt-get update && apt-get -y install python3.11-venv 
30 | ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
31 | RUN /install.sh && rm /install.sh
32 | 
33 | ENV PYTHONUNBUFFERED=1
34 | WORKDIR /root
35 | COPY pyproject.toml pyproject.toml
36 | COPY olmocr/version.py olmocr/version.py
37 | 
38 | RUN /root/.local/bin/uv pip install --system --no-cache -e .
39 | 
40 | RUN /root/.local/bin/uv pip install --system --no-cache sgl-kernel==0.0.3.post1 --force-reinstall --no-deps
41 | RUN /root/.local/bin/uv pip install --system --no-cache "sglang[all]==0.4.2" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/
42 | 
43 | COPY olmocr olmocr
44 | 
45 | WORKDIR /root
46 | COPY olmocr olmocr
47 | 
48 | RUN python3 -m sglang.launch_server --help
49 | RUN python3 -m olmocr.pipeline --help


--------------------------------------------------------------------------------
/scripts/beaker/Dockerfile-tagging:
--------------------------------------------------------------------------------
 1 | FROM --platform=linux/amd64 nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04
 2 | 
 3 | RUN apt-get update -y && apt-get install -y software-properties-common \
 4 |     && add-apt-repository ppa:deadsnakes/ppa \
 5 |     && apt-get -y update
 6 | 
 7 | # Install requirements specific to pdfs
 8 | RUN apt-get update && apt-get -y install python3-apt
 9 | RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections
10 | RUN apt-get update -y && apt-get install -y poppler-utils ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools
11 | 
12 | RUN apt-get update -y && apt-get install -y --no-install-recommends \
13 |     git \
14 |     python3.11 \
15 |     python3.11-dev \
16 |     python3.11-distutils \
17 |     ca-certificates \
18 |     build-essential \
19 |     curl \
20 |     unzip
21 | 
22 | RUN rm -rf /var/lib/apt/lists/* \
23 |     && unlink /usr/bin/python3 \
24 |     && ln -s /usr/bin/python3.11 /usr/bin/python3 \
25 |     && ln -s /usr/bin/python3 /usr/bin/python \
26 |     && curl -sS https://bootstrap.pypa.io/get-pip.py | python \
27 |     && pip3 install -U pip    
28 | 
29 | RUN apt-get update && apt-get -y install python3.11-venv 
30 | ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
31 | RUN /install.sh && rm /install.sh
32 | 
33 | ENV PYTHONUNBUFFERED=1
34 | WORKDIR /root
35 | COPY pyproject.toml pyproject.toml
36 | COPY olmocr/version.py olmocr/version.py
37 | 
38 | RUN /root/.local/bin/uv pip install --system --no-cache -e .
39 | 
40 | RUN /root/.local/bin/uv pip install --system --no-cache vllm==0.8.2
41 | 
42 | 
43 | WORKDIR /root
44 | COPY olmocr olmocr
45 | COPY scripts scripts
46 | 
47 | RUN vllm --help
48 | RUN python3 -m olmocr.pipeline --help
49 | RUN python scripts/tagging_pipeline.py --help


--------------------------------------------------------------------------------
/scripts/beaker/Dockerfile-train:
--------------------------------------------------------------------------------
 1 | FROM gcr.io/ai2-beaker-core/public/cqgl31u2ba5vrtuc91og:latest
 2 | 
 3 | # Update the package list and install libaio-dev and gnupg2
 4 | RUN apt update && apt-get install -y libaio-dev gnupg2
 5 | 
 6 | # Add NVIDIA package repository keys
 7 | RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
 8 |     && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub \
 9 |     && apt-get -y update
10 | 
11 | # Set up the NVIDIA CUDA repository
12 | RUN apt-get install -y software-properties-common \
13 |     && add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" \
14 |     && apt-get update
15 | 
16 | # Install CUDA toolkit and nvcc 12.1
17 | RUN apt-get install -y cuda-nvcc-12-1
18 | 
19 | # Get flash attention setup
20 | RUN pip install flash-attn --no-build-isolation
21 | 
22 | # Install PDF utilities
23 | RUN apt-get install -y poppler-utils
24 | RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections
25 | RUN apt-get install -y ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools
26 | 
27 | 


--------------------------------------------------------------------------------
/scripts/beaker/gpu-ci-script.sh:
--------------------------------------------------------------------------------
 1 | #/usr/bin/bash
 2 | 
 3 | set -ex
 4 | 
 5 | git clone https://github.com/allenai/olmocr.git olmocr \
 6 |   && cd olmocr \
 7 |   && git checkout $GIT_REVISION \
 8 |   && /root/.local/bin/uv pip install --system --no-cache \
 9 |     .[gpu] \
10 |     pytest \
11 |     --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/ \
12 |   && bash scripts/run_integration_test.sh
13 | 
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/scripts/beaker/jupiter-ib.sh:
--------------------------------------------------------------------------------
1 | set -ex
2 | export NCCL_DEBUG=INFO NCCL_SOCKET_IFNAME=ib NCCL_IB_HCA="^=mlx5_bond_0"


--------------------------------------------------------------------------------
/scripts/beaker/pluto-ib.sh:
--------------------------------------------------------------------------------
1 | set -ex
2 | export NCCL_DEBUG=INFO NCCL_SOCKET_IFNAME=ib NCCL_IB_HCA="^=mlx5_1,mlx5_2"


--------------------------------------------------------------------------------
/scripts/birr/config/qwen2-vl-7b-pdf-weka.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   # full fine tune
 3 |   name_or_path: weka://oe-data-default/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/best_bf16/
 4 |   #name_or_path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/checkpoint-9500/bf16/
 5 |   vlm: true
 6 | 
 7 |   # necessary to prevent random crashes, until vllm fixes some bugs
 8 |   num_scheduler_steps: 1
 9 | 
10 | format:
11 |   add_generation_prompt: true
12 | 
13 | generate:
14 |   # The model's max context length is 8096, but around 1500 tokens are reserved for the image itself
15 |   max_context_length: 6500
16 |   temperature: 0.8
17 |   top_p: 1.0
18 |   drop_long_outputs: false
19 | 
20 | 
21 | pipeline:
22 |   sqs_queue_name: jake-pdf
23 |   num_workers: 3
24 |   generation_batch_size: 256
25 |   tokenization_batch_size: 64
26 |   output_serializer: default
27 |   target_bucket: ai2-oe-data
28 |   target_object_prefix: [your username]/pdfworkspaces/s2orc_3200k_v2/inference_outputs
29 |   allowed_restarts_per_predictor: 10
30 | 
31 | task:
32 |   budget: ai2/oe-data
33 |   workspace: ai2/oe-data-model-based-cleanup
34 |   name: qwen2vl-schedsteps-bg
35 |   replicas: 128
36 |   priority: LOW
37 |   gpu_count: 1
38 |   cluster:
39 |     - ai2/jupiter-cirrascale-2
40 |     - ai2/saturn-cirrascale
41 | 
42 | 


--------------------------------------------------------------------------------
/scripts/build-docker.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | VERSION=$(python -c 'import olmocr.version; print(olmocr.version.VERSION)')
 6 | echo "$VERSION"
 7 | 
 8 | docker build --platform linux/amd64 -f ./scripts/beaker/Dockerfile-inference  -t olmocr-inference-$VERSION .
 9 | beaker image create --workspace ai2/oe-data-pdf --name olmocr-inference-$VERSION olmocr-inference-$VERSION
10 | 
11 | docker build --platform linux/amd64 -f ./scripts/beaker/Dockerfile-tagging  -t olmocr-tagging-$VERSION .
12 | beaker image create --workspace ai2/oe-data-pdf --name olmocr-tagging-$VERSION olmocr-tagging-$VERSION


--------------------------------------------------------------------------------
/scripts/check_qual.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | python scripts/pii_rule_comparison.py \
 6 |   --docs-folder /home/ubuntu/s2pdf_dedupe_minhash_v1_with_no_pii/documents \
 7 |   --ref-rule "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.5 and \
 8 |               fineweb_edu_fasttext_gt2__fineweb_edu_fasttext_gt2__score:avg>0.001 and \
 9 |               avg_fraction_numbers_in_line_v1__avg_fraction_numbers_in_line_v1__avg_fraction_numbers_in_line_ratio:avg<0.2 and \
10 |               pipe_delimited_lines_v1__pipe_delimited_lines_v1__pipe_delimited_lines_ratio:avg<0.3 \
11 |              " \
12 |   --hyp-rule "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.5 and \
13 |               fineweb_edu_fasttext_gt2__fineweb_edu_fasttext_gt2__score:avg>0.001 and \
14 |               avg_fraction_numbers_in_line_v1__avg_fraction_numbers_in_line_v1__avg_fraction_numbers_in_line_ratio:avg<0.2 and \
15 |               pipe_delimited_lines_v1__pipe_delimited_lines_v1__pipe_delimited_lines_ratio:avg<0.4 \
16 |              " \
17 |   --output-dir results/pii_detection \
18 | 
19 | 
20 | # Run1, langid, pipes and numbers
21 | # Prompt, boilerplate, reference, prose, table classification -> train fasttext
22 | # 50k docs to train fast text
23 | 
24 | tinyhost results/pii_detection/*


--------------------------------------------------------------------------------
/scripts/elo/README.md:
--------------------------------------------------------------------------------
 1 | # elo rating
 2 | 
 3 | Calculates elo rating of olmOCR vs other tools.
 4 | 
 5 | ## Data
 6 | 
 7 | The pairwise judgment data is stored in `ratings.csv` as win/loss counts:
 8 | ```
 9 | MethodA,MethodB,A_wins,B_wins,A_rate(%),B_rate(%)
10 | marker,mineru,53,26,67.1,32.9
11 | mineru,pdelf,22,55,28.6,71.4
12 | gotocr_format,marker,26,45,36.6,63.4
13 | marker,pdelf,31,49,38.8,61.3
14 | gotocr_format,pdelf,29,41,41.4,58.6
15 | gotocr_format,mineru,38,37,50.7,49.3
16 | ```
17 | 
18 | *Note* `pdfelf` is olmOCR.
19 | 
20 | ## Usage
21 | 
22 | To calculate elo ratings, run the following command:
23 | ```bash
24 | python calculate_elo_ratings.py ratings.csv --num-bootstrap 5000 --num-elo-sims 100 --confidence-level 95 --seed 123
25 | ```
26 | 
27 | It should print something like:
28 | ```
29 | Bootstrapped Elo Ratings (95% CI):
30 | --------------------------------------------------
31 | pdelf        1813.0 ± 84.9 [1605.9, 1930.0]
32 | mineru       1545.2 ± 99.7 [1336.7, 1714.1]
33 | marker       1429.1 ± 100.7 [1267.6, 1645.5]
34 | gotocr_format 1212.7 ± 82.0 [1097.3, 1408.3]
35 | 
36 | Pairwise Significance Tests:
37 | --------------------------------------------------
38 | gotocr_format vs marker       Δ = -216.3 [-470.8,  135.0] p = 0.218
39 | gotocr_format vs mineru       Δ = -332.5 [-567.5,   19.3] p = 0.051
40 | gotocr_format vs pdelf        Δ = -600.3 [-826.1, -344.3] p = 0.000*
41 | marker       vs mineru       Δ = -116.1 [-365.4,  246.5] p = 0.430
42 | marker       vs pdelf        Δ = -383.9 [-610.6,  -10.9] p = 0.044*
43 | mineru       vs pdelf        Δ = -267.8 [-517.3,  104.0] p = 0.135
44 | ```
45 | 
46 | which is also already saved in `results.txt`.
47 | 
48 | To generate boxplots of elo ratings, run the following command:
49 | ```bash
50 | python draw_boxplots.py results.txt boxplots.png
51 | ```
52 | 
53 | which should save boxplots as `boxplots.png`.


--------------------------------------------------------------------------------
/scripts/elo/boxplots.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/scripts/elo/boxplots.png


--------------------------------------------------------------------------------
/scripts/elo/ratings.csv:
--------------------------------------------------------------------------------
1 | MethodA,MethodB,A_wins,B_wins,A_rate(%),B_rate(%)
2 | marker,mineru,53,26,67.1,32.9
3 | mineru,pdelf,22,55,28.6,71.4
4 | gotocr_format,marker,26,45,36.6,63.4
5 | marker,pdelf,31,49,38.8,61.3
6 | gotocr_format,pdelf,29,41,41.4,58.6
7 | gotocr_format,mineru,38,37,50.7,49.3


--------------------------------------------------------------------------------
/scripts/elo/results.txt:
--------------------------------------------------------------------------------
 1 | Bootstrapped Elo Ratings (95% CI):
 2 | --------------------------------------------------
 3 | pdelf        1813.0 ± 84.9 [1605.9, 1930.0]
 4 | mineru       1545.2 ± 99.7 [1336.7, 1714.1]
 5 | marker       1429.1 ± 100.7 [1267.6, 1645.5]
 6 | gotocr_format 1212.7 ± 82.0 [1097.3, 1408.3]
 7 | 
 8 | Pairwise Significance Tests:
 9 | --------------------------------------------------
10 | gotocr_format vs marker       Δ = -216.3 [-470.8,  135.0] p = 0.218
11 | gotocr_format vs mineru       Δ = -332.5 [-567.5,   19.3] p = 0.051
12 | gotocr_format vs pdelf        Δ = -600.3 [-826.1, -344.3] p = 0.000*
13 | marker       vs mineru       Δ = -116.1 [-365.4,  246.5] p = 0.430
14 | marker       vs pdelf        Δ = -383.9 [-610.6,  -10.9] p = 0.044*
15 | mineru       vs pdelf        Δ = -267.8 [-517.3,  104.0] p = 0.135
16 | 
17 | 


--------------------------------------------------------------------------------
/scripts/jsonl_to_markdown.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import sys
 4 | 
 5 | 
 6 | # This is a simple script to convert JSONL files to Markdown format.
 7 | # It reads each line of the JSONL file, extracts the 'text' field,
 8 | # and saves it as a Markdown file with the line number as the filename.
 9 | # The script also handles potential JSON decoding errors and prints relevant messages.
10 | def jsonl_to_markdown(input_file, output_dir):
11 |     """
12 |     Reads a JSONL file, extracts the 'text' field from each line, and saves it as a Markdown file.
13 | 
14 |     Args:
15 |         input_file (str): Path to the input JSONL file.
16 |         output_dir (str): Directory to save the Markdown files.
17 |     """
18 |     if not os.path.exists(output_dir):
19 |         os.makedirs(output_dir)
20 | 
21 |     with open(input_file, "r", encoding="utf-8") as file:
22 |         for i, line in enumerate(file):
23 |             try:
24 |                 # Parse the JSON line
25 |                 data = json.loads(line)
26 |                 text_content = data.get("text", "")
27 | 
28 |                 # Save to a Markdown file
29 |                 output_file = os.path.join(output_dir, f"line_{i + 1}.md")
30 |                 with open(output_file, "w", encoding="utf-8") as md_file:
31 |                     md_file.write(text_content)
32 | 
33 |                 print(f"Extracted and saved line {i + 1} to {output_file}")
34 |             except json.JSONDecodeError as e:
35 |                 print(f"Error decoding JSON on line {i + 1}: {e}")
36 |             except Exception as e:
37 |                 print(f"Unexpected error on line {i + 1}: {e}")
38 | 
39 | 
40 | # Example usage
41 | # input_jsonl_file = "/path/to/test.jsonl"  # Replace with the actual path to your JSONL file
42 | # output_directory = "/path/to/output_markdown"  # Replace with the desired output directory
43 | # jsonl_to_markdown(input_jsonl_file, output_directory)
44 | 
45 | # This is the main entrypoint to use the script from the command line.
46 | # It takes two arguments: the input JSONL file and the output directory.
47 | # The script will create the output directory if it does not exist.
48 | if __name__ == "__main__":
49 |     if len(sys.argv) != 3:
50 |         print("Usage: python jsonl_to_markdown.py <input_file> <output_dir>")
51 |         sys.exit(1)
52 | 
53 |     input_file = sys.argv[1]
54 |     output_dir = sys.argv[2]
55 | 
56 |     jsonl_to_markdown(input_file, output_dir)
57 | 


--------------------------------------------------------------------------------
/scripts/molmo-7b-lora-gantry.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -ex
 4 | 
 5 | # check if jq is installed
 6 | if ! command -v jq &> /dev/null
 7 | then
 8 |     echo "jq could not be found. Please install it."
 9 |     exit
10 | fi
11 | 
12 | 
13 | EXTRA_ARGS="-c olmocr/train/config/molmo-o-lora-8192.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/molmo-pdf/v1/models/\${BEAKER_USER_ID}\""
14 | 
15 | run_name=$(basename "$0" .sh)
16 | 
17 | # --cluster 'ai2/jupiter*' \
18 | # --cluster 'ai2/pluto*' \
19 | # --cluster 'ai2/allennlp-cirrascale' \
20 | # --priority high \
21 | 
22 | CLUSTER='jupiter'
23 | 
24 | gantry run \
25 |     --description "${run_name}-8192"\
26 |     --task-name "${run_name}-8192"\
27 |     --allow-dirty \
28 |     --host-networking \
29 |     --workspace ai2/oe-data-model-based-cleanup \
30 |     --beaker-image 'jakep/jakep-pdf-finetunev1.2' \
31 |     --venv 'base' \
32 |     --pip gantry-requirements.txt \
33 |     --priority high \
34 |     --gpus 8 \
35 |     --cluster "ai2/${CLUSTER}*" \
36 |     --budget ai2/oe-data \
37 |     --weka "oe-data-default:/data" \
38 |     --env LOG_FILTER_TYPE=local_rank0_only \
39 |     --env OMP_NUM_THREADS=8 \
40 |     --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
41 |     --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
42 |     --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
43 |     --env-secret DS_AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
44 |     --env-secret DS_AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
45 |     --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
46 |     --shared-memory 10GiB \
47 |     --yes \
48 |     -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m olmocr.train.loaddataset ${EXTRA_ARGS} && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}"


--------------------------------------------------------------------------------
/scripts/parse_with_pdfminer.py:
--------------------------------------------------------------------------------
 1 | from pdfminer.high_level import extract_pages
 2 | from pdfminer.layout import LTChar
 3 | 
 4 | 
 5 | def extract_chars_with_transforms(pdf_path, page_num=0):
 6 |     """
 7 |     Extract characters with transformation data for a specific page in a PDF.
 8 | 
 9 |     Args:
10 |         pdf_path (str): Path to the PDF file
11 |         page_num (int): Page number to extract (0-indexed)
12 |     """
13 |     print(f"Analyzing PDF: {pdf_path}, Page: {page_num + 1}")
14 |     char_count = 0
15 | 
16 |     # Extract only the specified page
17 |     for i, page_layout in enumerate(extract_pages(pdf_path)):
18 |         if i == page_num:
19 |             print(f"Processing page {page_num + 1}")
20 | 
21 |             # Recursively process all elements
22 |             def process_element(element, level=0):
23 |                 nonlocal char_count
24 |                 indent = "  " * level
25 | 
26 |                 if isinstance(element, LTChar):
27 |                     char = element.get_text()
28 |                     matrix = element.matrix
29 |                     font = element.fontname if hasattr(element, "fontname") else "Unknown"
30 |                     size = element.size if hasattr(element, "size") else "Unknown"
31 | 
32 |                     print(f"{indent}Character: '{char}'")
33 |                     print(f"{indent}Transform Matrix: {matrix}")
34 |                     print(f"{indent}Font: {font}, Size: {size}")
35 |                     print(f"{indent}{'-' * 30}")
36 |                     char_count += 1
37 | 
38 |                 # For container elements, process their children
39 |                 if hasattr(element, "_objs"):
40 |                     for obj in element._objs:
41 |                         process_element(obj, level + 1)
42 | 
43 |             # Process all elements in the page
44 |             for element in page_layout:
45 |                 process_element(element)
46 | 
47 |             break  # Stop after processing the requested page
48 | 
49 |     print(f"\nTotal characters extracted: {char_count}")
50 | 
51 |     if char_count == 0:
52 |         print("No characters were extracted. This could mean:")
53 |         print(f"1. Page {page_num + 1} doesn't exist or is empty")
54 |         print("2. The PDF contains scanned images rather than text")
55 |         print("3. The text is embedded in a way PDFMiner can't extract")
56 | 
57 | 
58 | # Usage
59 | 
60 | pdf_path = "/Users/kylel/Downloads/olmOCR_Technical_Report_COLM_2025.pdf"
61 | extract_chars_with_transforms(pdf_path)
62 | 


--------------------------------------------------------------------------------
/scripts/prepare_changelog.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | from pathlib import Path
 3 | 
 4 | from olmocr.version import VERSION
 5 | 
 6 | 
 7 | def main():
 8 |     changelog = Path("CHANGELOG.md")
 9 | 
10 |     with changelog.open() as f:
11 |         lines = f.readlines()
12 | 
13 |     insert_index: int = -1
14 |     for i in range(len(lines)):
15 |         line = lines[i]
16 |         if line.startswith("## Unreleased"):
17 |             insert_index = i + 1
18 |         elif line.startswith(f"## [v{VERSION}]"):
19 |             print("CHANGELOG already up-to-date")
20 |             return
21 |         elif line.startswith("## [v"):
22 |             break
23 | 
24 |     if insert_index < 0:
25 |         raise RuntimeError("Couldn't find 'Unreleased' section")
26 | 
27 |     lines.insert(insert_index, "\n")
28 |     lines.insert(
29 |         insert_index + 1,
30 |         f"## [v{VERSION}](https://github.com/allenai/olmocr/releases/tag/v{VERSION}) - " f"{datetime.now().strftime('%Y-%m-%d')}\n",
31 |     )
32 | 
33 |     with changelog.open("w") as f:
34 |         f.writelines(lines)
35 | 
36 | 
37 | if __name__ == "__main__":
38 |     main()
39 | 


--------------------------------------------------------------------------------
/scripts/qwen25vl-7b-gantry.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -ex
 4 | 
 5 | # check if jq is installed
 6 | if ! command -v jq &> /dev/null
 7 | then
 8 |     echo "jq could not be found. Please install it."
 9 |     exit
10 | fi
11 | 
12 | 
13 | EXTRA_ARGS="-c olmocr/train/config/qwen25vl-7b.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen25vl-pdf/v1/models/\${BEAKER_USER_ID}\""
14 | 
15 | run_name=$(basename "$0" .sh)
16 | 
17 | # --cluster 'ai2/jupiter*' \
18 | # --cluster 'ai2/pluto*' \
19 | # --cluster 'ai2/allennlp-cirrascale' \
20 | # --priority high \
21 | 
22 | CLUSTER='jupiter'
23 | 
24 | gantry run \
25 |     --description "${run_name}"\
26 |     --task-name "${run_name}"\
27 |     --allow-dirty \
28 |     --host-networking \
29 |     --workspace ai2/oe-data-model-based-cleanup \
30 |     --beaker-image 'jakep/jakep-pdf-finetunev1.2' \
31 |     --venv 'base' \
32 |     --pip gantry-requirements.txt \
33 |     --priority high \
34 |     --gpus 8 \
35 |     --preemptible \
36 |     --cluster "ai2/${CLUSTER}*" \
37 |     --budget ai2/oe-data \
38 |     --weka "oe-data-default:/data" \
39 |     --env LOG_FILTER_TYPE=local_rank0_only \
40 |     --env OMP_NUM_THREADS=8 \
41 |     --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
42 |     --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
43 |     --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
44 |     --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
45 |     --shared-memory 10GiB \
46 |     --yes \
47 |     -- /bin/bash -c "pip install transformers==4.51.3 && source scripts/beaker/${CLUSTER}-ib.sh && python -m olmocr.train.loaddataset ${EXTRA_ARGS} && accelerate launch --use_fsdp --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --fsdp_offload_params false --fsdp_sharding_strategy FULL_SHARD --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}"


--------------------------------------------------------------------------------
/scripts/qwen2vl-2b-gantry.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -ex
 4 | 
 5 | # check if jq is installed
 6 | if ! command -v jq &> /dev/null
 7 | then
 8 |     echo "jq could not be found. Please install it."
 9 |     exit
10 | fi
11 | 
12 | 
13 | EXTRA_ARGS="-c olmocr/train/config/qwen2vl-2b.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\""
14 | 
15 | run_name=$(basename "$0" .sh)
16 | 
17 | # --cluster 'ai2/jupiter*' \
18 | # --cluster 'ai2/pluto*' \
19 | # --cluster 'ai2/allennlp-cirrascale' \
20 | # --priority high \
21 | 
22 | CLUSTER='jupiter'
23 | 
24 | gantry run \
25 |     --description "${run_name}"\
26 |     --task-name "${run_name}"\
27 |     --allow-dirty \
28 |     --host-networking \
29 |     --workspace ai2/oe-data-pdf \
30 |     --beaker-image 'jakep/jakep-pdf-finetunev1.2' \
31 |     --venv 'base' \
32 |     --pip gantry-requirements.txt \
33 |     --priority normal \
34 |     --gpus 8 \
35 |     --preemptible \
36 |     --cluster "ai2/${CLUSTER}*" \
37 |     --budget ai2/oe-data \
38 |     --env LOG_FILTER_TYPE=local_rank0_only \
39 |     --env OMP_NUM_THREADS=8 \
40 |     --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
41 |     --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
42 |     --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
43 |     --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
44 |     --shared-memory 10GiB \
45 |     --yes \
46 |     -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}"


--------------------------------------------------------------------------------
/scripts/qwen2vl-7b-gantry.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -ex
 4 | 
 5 | # check if jq is installed
 6 | if ! command -v jq &> /dev/null
 7 | then
 8 |     echo "jq could not be found. Please install it."
 9 |     exit
10 | fi
11 | 
12 | 
13 | EXTRA_ARGS="-c olmocr/train/config/qwen2vl-7b.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\""
14 | 
15 | run_name=$(basename "$0" .sh)
16 | 
17 | # --cluster 'ai2/jupiter*' \
18 | # --cluster 'ai2/pluto*' \
19 | # --cluster 'ai2/allennlp-cirrascale' \
20 | # --priority high \
21 | 
22 | CLUSTER='jupiter'
23 | 
24 | gantry run \
25 |     --description "${run_name}"\
26 |     --task-name "${run_name}"\
27 |     --allow-dirty \
28 |     --host-networking \
29 |     --workspace ai2/oe-data-model-based-cleanup \
30 |     --beaker-image 'jakep/jakep-pdf-finetunev1.2' \
31 |     --venv 'base' \
32 |     --pip gantry-requirements.txt \
33 |     --priority high \
34 |     --gpus 8 \
35 |     --preemptible \
36 |     --cluster "ai2/${CLUSTER}*" \
37 |     --budget ai2/oe-data \
38 |     --weka "oe-data-default:/data" \
39 |     --env LOG_FILTER_TYPE=local_rank0_only \
40 |     --env OMP_NUM_THREADS=8 \
41 |     --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
42 |     --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
43 |     --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
44 |     --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
45 |     --shared-memory 10GiB \
46 |     --yes \
47 |     -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m olmocr.train.loaddataset ${EXTRA_ARGS} && accelerate launch --use_fsdp --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --fsdp_offload_params false --fsdp_sharding_strategy FULL_SHARD --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}"


--------------------------------------------------------------------------------
/scripts/qwen2vl-7b-lora-gantry.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -ex
 4 | 
 5 | # check if jq is installed
 6 | if ! command -v jq &> /dev/null
 7 | then
 8 |     echo "jq could not be found. Please install it."
 9 |     exit
10 | fi
11 | 
12 | 
13 | EXTRA_ARGS="-c olmocr/train/config/qwen2vl-7b-lora.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\""
14 | 
15 | run_name=$(basename "$0" .sh)
16 | 
17 | # --cluster 'ai2/jupiter*' \
18 | # --cluster 'ai2/pluto*' \
19 | # --cluster 'ai2/allennlp-cirrascale' \
20 | # --priority high \
21 | 
22 | CLUSTER='jupiter'
23 | 
24 | gantry run \
25 |     --description "${run_name}"\
26 |     --task-name "${run_name}"\
27 |     --allow-dirty \
28 |     --host-networking \
29 |     --workspace ai2/oe-data-model-based-cleanup \
30 |     --beaker-image 'jakep/jakep-pdf-finetunev1.2' \
31 |     --venv 'base' \
32 |     --pip gantry-requirements.txt \
33 |     --priority high \
34 |     --gpus 8 \
35 |     --preemptible \
36 |     --cluster "ai2/${CLUSTER}*" \
37 |     --budget ai2/oe-data \
38 |     --weka "oe-data-default:/data" \
39 |     --env LOG_FILTER_TYPE=local_rank0_only \
40 |     --env OMP_NUM_THREADS=8 \
41 |     --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
42 |     --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
43 |     --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
44 |     --env-secret DS_AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
45 |     --env-secret DS_AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
46 |     --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
47 |     --shared-memory 10GiB \
48 |     --yes \
49 |     -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m olmocr.train.loaddataset ${EXTRA_ARGS} && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}"


--------------------------------------------------------------------------------
/scripts/release.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | # Function to extract version components from version.py using regex
 6 | get_version_from_file() {
 7 |     VERSION_FILE="olmocr/version.py"
 8 | 
 9 |     if [[ ! -f "$VERSION_FILE" ]]; then
10 |         echo "Error: $VERSION_FILE does not exist."
11 |         exit 1
12 |     fi
13 | 
14 |     # Extract _MAJOR
15 |     _MAJOR=$(grep -E '^_MAJOR\s*=\s*"([^"]+)"' "$VERSION_FILE" | sed -E 's/_MAJOR\s*=\s*"([^"]+)"/\1/')
16 |     if [[ -z "$_MAJOR" ]]; then
17 |         echo "Error: Could not extract _MAJOR from $VERSION_FILE."
18 |         exit 1
19 |     fi
20 | 
21 |     # Extract _MINOR
22 |     _MINOR=$(grep -E '^_MINOR\s*=\s*"([^"]+)"' "$VERSION_FILE" | sed -E 's/_MINOR\s*=\s*"([^"]+)"/\1/')
23 |     if [[ -z "$_MINOR" ]]; then
24 |         echo "Error: Could not extract _MINOR from $VERSION_FILE."
25 |         exit 1
26 |     fi
27 | 
28 |     # Extract _PATCH
29 |     _PATCH=$(grep -E '^_PATCH\s*=\s*"([^"]+)"' "$VERSION_FILE" | sed -E 's/_PATCH\s*=\s*"([^"]+)"/\1/')
30 |     if [[ -z "$_PATCH" ]]; then
31 |         echo "Error: Could not extract _PATCH from $VERSION_FILE."
32 |         exit 1
33 |     fi
34 | 
35 |     # Extract _SUFFIX (optional)
36 |     _SUFFIX=$(grep -E '^_SUFFIX\s*=\s*"([^"]*)"' "$VERSION_FILE" | sed -E 's/_SUFFIX\s*=\s*"([^"]*)"/\1/')
37 |     if [[ -z "$_SUFFIX" ]]; then
38 |         _SUFFIX=""
39 |     fi
40 | 
41 |     # Construct VERSION
42 |     VERSION_PY="${_MAJOR}.${_MINOR}.${_PATCH}${_SUFFIX}"
43 |     echo "$VERSION_PY"
44 | }
45 | 
46 | TAG=$(python -c 'from olmocr.version import VERSION; print("v" + VERSION)')
47 | 
48 | # Get the VERSION from version.py
49 | VERSION_PY=$(get_version_from_file)
50 | 
51 | # Compare the two versions
52 | if [[ "v$VERSION_PY" != "$TAG" ]]; then
53 |     echo "Version mismatch detected:"
54 |     echo "  Python reported version: $TAG"
55 |     echo "  version.py contains: v$VERSION_PY"
56 |     echo
57 |     read -p "The versions do not match. Please run 'pip install -e .' to synchronize versions. Do you want to continue? [Y/n] " prompt
58 | 
59 |     if [[ ! "$prompt" =~ ^([yY][eE][sS]|[yY])$ ]]; then
60 |         echo "Release process aborted due to version mismatch."
61 |         exit 1
62 |     else
63 |         echo "Proceeding with the release despite the version mismatch."
64 |     fi
65 | fi
66 | 
67 | read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt
68 | 
69 | if [[ $prompt == "y" || $prompt == "Y" || $prompt == "yes" || $prompt == "Yes" ]]; then
70 |     python scripts/prepare_changelog.py
71 |     git add -A
72 |     git commit -m "Bump version to $TAG for release" || true && git push
73 |     echo "Creating new git tag $TAG"
74 |     git tag "$TAG" -m "$TAG"
75 |     git push --tags
76 | else
77 |     echo "Cancelled"
78 |     exit 1
79 | fi


--------------------------------------------------------------------------------
/scripts/release_notes.py:
--------------------------------------------------------------------------------
 1 | # encoding: utf-8
 2 | 
 3 | """
 4 | Prepares markdown release notes for GitHub releases.
 5 | """
 6 | 
 7 | import os
 8 | from typing import List, Optional
 9 | 
10 | import packaging.version
11 | 
12 | TAG = os.environ["TAG"]
13 | 
14 | ADDED_HEADER = "### Added 🎉"
15 | CHANGED_HEADER = "### Changed ⚠️"
16 | FIXED_HEADER = "### Fixed ✅"
17 | REMOVED_HEADER = "### Removed 👋"
18 | 
19 | 
20 | def get_change_log_notes() -> str:
21 |     in_current_section = False
22 |     current_section_notes: List[str] = []
23 |     with open("CHANGELOG.md") as changelog:
24 |         for line in changelog:
25 |             if line.startswith("## "):
26 |                 if line.startswith("## Unreleased"):
27 |                     continue
28 |                 if line.startswith(f"## [{TAG}]"):
29 |                     in_current_section = True
30 |                     continue
31 |                 break
32 |             if in_current_section:
33 |                 if line.startswith("### Added"):
34 |                     line = ADDED_HEADER + "\n"
35 |                 elif line.startswith("### Changed"):
36 |                     line = CHANGED_HEADER + "\n"
37 |                 elif line.startswith("### Fixed"):
38 |                     line = FIXED_HEADER + "\n"
39 |                 elif line.startswith("### Removed"):
40 |                     line = REMOVED_HEADER + "\n"
41 |                 current_section_notes.append(line)
42 |     assert current_section_notes
43 |     return "## What's new\n\n" + "".join(current_section_notes).strip() + "\n"
44 | 
45 | 
46 | def get_commit_history() -> str:
47 |     new_version = packaging.version.parse(TAG)
48 | 
49 |     # Pull all tags.
50 |     os.popen("git fetch --tags")
51 | 
52 |     # Get all tags sorted by version, latest first.
53 |     all_tags = os.popen("git tag -l --sort=-version:refname 'v*'").read().split("\n")
54 | 
55 |     # Out of `all_tags`, find the latest previous version so that we can collect all
56 |     # commits between that version and the new version we're about to publish.
57 |     # Note that we ignore pre-releases unless the new version is also a pre-release.
58 |     last_tag: Optional[str] = None
59 |     for tag in all_tags:
60 |         if not tag.strip():  # could be blank line
61 |             continue
62 |         version = packaging.version.parse(tag)
63 |         if new_version.pre is None and version.pre is not None:
64 |             continue
65 |         if version < new_version:
66 |             last_tag = tag
67 |             break
68 |     if last_tag is not None:
69 |         commits = os.popen(f"git log {last_tag}..{TAG} --oneline --first-parent").read()
70 |     else:
71 |         commits = os.popen("git log --oneline --first-parent").read()
72 |     return "## Commits\n\n" + commits
73 | 
74 | 
75 | def main():
76 |     print(get_change_log_notes())
77 |     print(get_commit_history())
78 | 
79 | 
80 | if __name__ == "__main__":
81 |     main()
82 | 


--------------------------------------------------------------------------------
/scripts/run_benchmark.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | # Use conda environment Python if available, otherwise use system Python
 6 | if [ -n "$CONDA_PREFIX" ]; then
 7 |     PYTHON="$CONDA_PREFIX/bin/python"
 8 |     echo "Using conda Python from: $CONDA_PREFIX"
 9 | else
10 |     PYTHON="python"
11 |     echo "Warning: No conda environment detected, using system Python"
12 | fi
13 | 
14 | # Get version from version.py
15 | VERSION=$($PYTHON -c 'import olmocr.version; print(olmocr.version.VERSION)')
16 | echo "OlmOCR version: $VERSION"
17 | 
18 | # Get first 10 characters of git hash
19 | GIT_HASH=$(git rev-parse HEAD | cut -c1-10)
20 | echo "Git hash: $GIT_HASH"
21 | 
22 | # Get current git branch name
23 | GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
24 | echo "Git branch: $GIT_BRANCH"
25 | 
26 | # Create full image tag
27 | IMAGE_TAG="olmocr-benchmark-${VERSION}-${GIT_HASH}"
28 | echo "Building Docker image with tag: $IMAGE_TAG"
29 | 
30 | # Build the Docker image
31 | echo "Building Docker image..."
32 | docker build --platform linux/amd64 -f ./Dockerfile -t $IMAGE_TAG .
33 | 
34 | # Get Beaker username
35 | BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
36 | echo "Beaker user: $BEAKER_USER"
37 | 
38 | # Push image to beaker
39 | echo "Pushing image to Beaker..."
40 | beaker image create --workspace ai2/oe-data-pdf --name $IMAGE_TAG $IMAGE_TAG
41 | 
42 | # Create Python script to run beaker experiment
43 | cat << 'EOF' > /tmp/run_benchmark_experiment.py
44 | import sys
45 | from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints
46 | 
47 | # Get image tag, beaker user, git branch, and git hash from command line
48 | image_tag = sys.argv[1]
49 | beaker_user = sys.argv[2]
50 | git_branch = sys.argv[3]
51 | git_hash = sys.argv[4]
52 | 
53 | # Initialize Beaker client
54 | b = Beaker.from_env(default_workspace="ai2/olmocr")
55 | 
56 | # Create experiment spec
57 | experiment_spec = ExperimentSpec(
58 |     description=f"OlmOCR Benchmark Run - Branch: {git_branch}, Commit: {git_hash}",
59 |     budget="ai2/oe-data",
60 |     tasks=[
61 |         TaskSpec(
62 |             name="olmocr-benchmark",
63 |             image=ImageSource(beaker=f"{beaker_user}/{image_tag}"),
64 |             command=[
65 |                 "bash", "-c",
66 |                 " && ".join([
67 |                     "git clone https://huggingface.co/datasets/allenai/olmOCR-bench",
68 |                     "cd olmOCR-bench && git lfs pull && cd ..",
69 |                     "python -m olmocr.pipeline ./localworkspace --markdown --pdfs ./olmOCR-bench/bench_data/pdfs/**/*.pdf",
70 |                     "python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/olmocr --bench-path ./olmOCR-bench/",
71 |                     "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data"
72 |                 ])
73 |             ],
74 |             context=TaskContext(
75 |                 priority=Priority.normal,
76 |                 preemptible=True,
77 |             ),
78 |             resources=TaskResources(gpu_count=1),
79 |             constraints=Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
80 |             result=ResultSpec(path="/noop-results"),
81 |         )
82 |     ],
83 | )
84 | 
85 | # Create the experiment
86 | experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/olmocr")
87 | print(f"Created experiment: {experiment.id}")
88 | print(f"View at: https://beaker.org/ex/{experiment.id}")
89 | EOF
90 | 
91 | # Run the Python script to create the experiment
92 | echo "Creating Beaker experiment..."
93 | $PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH
94 | 
95 | # Clean up temporary file
96 | rm /tmp/run_benchmark_experiment.py
97 | 
98 | echo "Benchmark experiment submitted successfully!"


--------------------------------------------------------------------------------
/scripts/run_integration_test.sh:
--------------------------------------------------------------------------------
1 | #/usr/bin/bash
2 | 
3 | set -ex
4 | 
5 | python -m olmocr.pipeline ./localworkspace --pdfs tests/gnarly_pdfs/ambiguous.pdf tests/gnarly_pdfs/edgar.pdf tests/gnarly_pdfs/dolma-page-1.pdf \
6 |   && pytest tests/test_integration.py
7 | 


--------------------------------------------------------------------------------
/scripts/run_tagging_pipeline.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -e
4 | 
5 | gantry run --gpus 1 --workspace ai2/olmocr --beaker-image ai2/pytorch2.5.1-cuda12.1-python3.11 --cluster ai2/jupiter-cirrascale-2 --budget ai2/oe-data --priority normal --env-secret AWS_CREDENTIALS_FILE=jakep-AWS_CREDENTIALS_FILE --env-secret HF_TOKEN=jake-HF_TOKEN --allow-dirty -- /bin/bash -c "pip install -e .[gpu] --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/ && pip install --upgrade sglang==0.4.5.post3 transformers==4.51.3 && python scripts/tagging_pipeline.py s3://ai2-oe-data/jakep/s2pdf_dedupe_minhash_v1_mini s3://ai2-oe-data/jakep/s2pdf_dedupe_minhash_v1_mini_scratch"
6 | 
7 | gantry run --gpus 1 --workspace ai2/olmocr --beaker-image ai2/pytorch2.5.1-cuda12.1-python3.11 --cluster ai2/jupiter-cirrascale-2 --budget ai2/oe-data --priority normal --env-secret AWS_CREDENTIALS_FILE=jakep-AWS_CREDENTIALS_FILE --env-secret HF_TOKEN=jake-HF_TOKEN --allow-dirty -- /bin/bash -c "pip install -e .[gpu,bench] --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/ && huggingface-cli download allenai/olmOCR-bench --repo-type dataset --local-dir ./olmOCR-bench &&  olmocr/bench/scripts/convert_all.sh"


--------------------------------------------------------------------------------
/scripts/s2orc_extractor.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Define the output file for the metadata.sha1 fields
 4 | OUTPUT_FILE="s2orc_pdfs_v2.txt"
 5 | 
 6 | # Clear the output file if it already exists
 7 | > "$OUTPUT_FILE"
 8 | 
 9 | # Create a temporary directory for partial outputs
10 | temp_output_dir=$(mktemp -d)
11 | 
12 | # Ensure the temporary directory is cleaned up on exit or error
13 | trap 'rm -rf "$temp_output_dir"' EXIT
14 | 
15 | # Export the temporary output directory variable for use in xargs
16 | export temp_output_dir
17 | 
18 | echo "temp dir $temp_output_dir"
19 | 
20 | # Find all .gz files recursively from the current directory
21 | find 'split=train' -type f -name "*.gz" | \
22 |     xargs -P 30 -I{} bash -c '
23 |         gz_file="$1"
24 |         partial_output="$temp_output_dir/$(basename "$gz_file").txt"
25 | 
26 |         # Stream uncompressed data directly into jq and format the output
27 |         gunzip -c "$gz_file" | jq -r '"'"'
28 |             select(.metadata.sha1 != null) |
29 |             "s3://ai2-s2-pdfs/" + (.metadata.sha1[:4]) + "/" + (.metadata.sha1[4:]) + ".pdf"
30 |         '"'"' >> "$partial_output"
31 |     ' _ {}
32 | 
33 | # Concatenate all partial outputs into the final output file
34 | cat "$temp_output_dir"/*.txt >> "$OUTPUT_FILE"
35 | 
36 | echo "All metadata.sha1 fields have been extracted to $OUTPUT_FILE."
37 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/__init__.py


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/ambiguous.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/ambiguous.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/badlines.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/badlines.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/bws_book_ch2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/bws_book_ch2.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/discoverworld_crazy_tables.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/discoverworld_crazy_tables.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/dolma-page-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/dolma-page-1.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/edgar.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/edgar.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/failing_anchor_pg4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/failing_anchor_pg4.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/failing_pdf_pg9.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/failing_pdf_pg9.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/form_on_later_pages.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/form_on_later_pages.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/guidebook_failed_pages.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/guidebook_failed_pages.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/handwriting_bad_ocr.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/handwriting_bad_ocr.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/horribleocr.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/horribleocr.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/instructions_and_schematics.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/instructions_and_schematics.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/large_prompt_hint1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/large_prompt_hint1.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/large_prompt_hint2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/large_prompt_hint2.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/large_prompt_hint3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/large_prompt_hint3.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/load_v_error.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/load_v_error.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/lots_of_chem_tables.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/lots_of_chem_tables.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/lots_of_sci_tables.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/lots_of_sci_tables.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/map1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/map1.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/most_content_in_image_form.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/most_content_in_image_form.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/newspaper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/newspaper.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/not_parsing.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/not_parsing.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/not_parsing2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/not_parsing2.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/olmo-page-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/olmo-page-1.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/overrun_on_pg8.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/overrun_on_pg8.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/pdftotext_two_column_issue.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/pdftotext_two_column_issue.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/repeating_references_on_pg9_pg10.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/repeating_references_on_pg9_pg10.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/skinnypage.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/skinnypage.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/slideshow_mostly_good_some_pages_should_get_filtered.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/slideshow_mostly_good_some_pages_should_get_filtered.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/slideshow_mostly_images.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/slideshow_mostly_images.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/small_page_size.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/small_page_size.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/some_ocr1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/some_ocr1.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/ti89_guidebook_programming.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/ti89_guidebook_programming.pdf


--------------------------------------------------------------------------------
/tests/gnarly_pdfs/tobacco_missed_tokens_pg1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/tobacco_missed_tokens_pg1.pdf


--------------------------------------------------------------------------------
/tests/test_dataloader.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | from functools import partial
 3 | 
 4 | import pytest
 5 | from torch.utils.data import DataLoader
 6 | from tqdm import tqdm
 7 | from transformers import AutoProcessor
 8 | 
 9 | from olmocr.train.dataloader import (
10 |     build_finetuning_dataset,
11 |     extract_openai_batch_response,
12 |     list_dataset_files,
13 |     load_jsonl_into_ds,
14 | )
15 | from olmocr.train.dataprep import batch_prepare_data_for_qwen2_training
16 | 
17 | 
18 | @pytest.mark.nonci
19 | class TestBatchQueryResponseDataset(unittest.TestCase):
20 |     def testLoadS3(self):
21 |         ds = load_jsonl_into_ds("s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl", first_n_files=3)
22 | 
23 |         print(f"Loaded {len(ds)} entries")
24 |         print(ds)
25 |         print(ds["train"])
26 | 
27 |     def testFinetuningDS(self):
28 |         ds = build_finetuning_dataset(
29 |             response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json",
30 |         )
31 | 
32 |         print(ds)
33 | 
34 |         processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
35 | 
36 |         ds = ds.with_transform(partial(batch_prepare_data_for_qwen2_training, processor=processor, target_longest_image_dim=1024, target_anchor_text_len=6000))
37 | 
38 |         print(ds[0])
39 | 
40 |     def testPlotSequenceLengthHistogram(self):
41 |         import plotly.express as px
42 | 
43 |         ds = build_finetuning_dataset(
44 |             response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json",
45 |         )
46 | 
47 |         processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
48 | 
49 |         ds = ds.with_transform(partial(batch_prepare_data_for_qwen2_training, processor=processor, target_longest_image_dim=1024, target_anchor_text_len=6000))
50 | 
51 |         processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
52 | 
53 |         initial_len = len(ds)
54 | 
55 |         train_dataloader = DataLoader(ds, batch_size=1, num_workers=30, shuffle=False)
56 | 
57 |         max_seen_len = 0
58 |         steps = 0
59 |         sequence_lengths = []  # List to store sequence lengths
60 |         for entry in tqdm(train_dataloader):
61 |             num_input_tokens = entry["input_ids"].shape[1]
62 |             max_seen_len = max(max_seen_len, num_input_tokens)
63 |             sequence_lengths.append(num_input_tokens)  # Collecting sequence lengths
64 | 
65 |             if steps % 100 == 0:
66 |                 print(f"Max input len {max_seen_len}")
67 | 
68 |             steps += 1
69 | 
70 |             # model.forward(**{k: v.to("cuda:0") for (k,v) in entry.items()})
71 |         print(f"Max input len {max_seen_len}")
72 |         print(f"Total elements before filtering: {initial_len}")
73 |         print(f"Total elements after filtering: {steps}")
74 | 
75 |         # Plotting the histogram using Plotly
76 |         fig = px.histogram(
77 |             sequence_lengths, nbins=100, title="Distribution of Input Sequence Lengths", labels={"value": "Sequence Length", "count": "Frequency"}
78 |         )
79 | 
80 |         fig.write_image("sequence_lengths_histogram.png")
81 | 


--------------------------------------------------------------------------------
/tests/test_filter.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import unittest
 3 | 
 4 | from pypdf import PdfReader
 5 | 
 6 | from olmocr.filter import PdfFilter
 7 | 
 8 | 
 9 | class PdfFilterTest(unittest.TestCase):
10 |     def testFormLaterPages(self):
11 |         self.filter = PdfFilter(apply_form_check=True)
12 | 
13 |         self.assertTrue(self.filter.filter_out_pdf(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "form_on_later_pages.pdf")))
14 | 
15 |         self.filter = PdfFilter(apply_form_check=False)
16 | 
17 |         self.assertFalse(self.filter.filter_out_pdf(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "form_on_later_pages.pdf")))
18 | 


--------------------------------------------------------------------------------
/tests/test_integration.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import json
 3 | import os
 4 | import unittest
 5 | 
 6 | import pytest
 7 | 
 8 | 
 9 | @pytest.mark.nonci
10 | class TestPipelineIntegration(unittest.TestCase):
11 |     def setUp(self):
12 |         self.data = []
13 | 
14 |         for file in glob.glob(os.path.join("localworkspace", "results", "*.jsonl")):
15 |             with open(file, "r") as jf:
16 |                 for line in jf:
17 |                     if len(line.strip()) > 0:
18 |                         self.data.append(json.loads(line))
19 |                         print(self.data[-1])
20 | 
21 |     def test_edgar(self) -> None:
22 |         self.assertTrue(any("King of the English" in line["text"] for line in self.data))
23 | 
24 |     def test_ambig(self) -> None:
25 |         self.assertTrue(any("Apples and Bananas" in line["text"] for line in self.data))
26 | 
27 |     def test_dolma(self) -> None:
28 |         self.assertTrue(any("We extensively document Dolma" in line["text"] for line in self.data))
29 | 


--------------------------------------------------------------------------------
/tests/test_molmo.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import pytest
 4 | import requests
 5 | from PIL import Image
 6 | from transformers import (
 7 |     AutoModelForCausalLM,
 8 |     AutoProcessor,
 9 |     AutoTokenizer,
10 |     GenerationConfig,
11 | )
12 | 
13 | 
14 | @pytest.mark.nonci
15 | class MolmoProcessorTest(unittest.TestCase):
16 |     def test_molmo_demo(self):
17 |         # load the processor
18 |         processor = AutoProcessor.from_pretrained(
19 |             "allenai/Molmo-7B-O-0924",
20 |             trust_remote_code=True,
21 |             torch_dtype="auto",
22 |         )
23 | 
24 |         # load the model
25 |         model = AutoModelForCausalLM.from_pretrained(
26 |             "allenai/Molmo-7B-O-0924",
27 |             trust_remote_code=True,
28 |             torch_dtype="auto",
29 |         )
30 | 
31 |         device = "cuda:0"
32 | 
33 |         model = model.to(device)
34 | 
35 |         # process the image and text
36 |         inputs = processor.process(images=[Image.open(requests.get("https://picsum.photos/id/237/536/354", stream=True).raw)], text="Describe this image.")
37 | 
38 |         # move inputs to the correct device and make a batch of size 1
39 |         inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
40 | 
41 |         print("Raw inputs")
42 |         print(inputs)
43 | 
44 |         print("\nShapes")
45 |         # {('input_ids', torch.Size([1, 589])), ('images', torch.Size([1, 5, 576, 588])), ('image_masks', torch.Size([1, 5, 576])), ('image_input_idx', torch.Size([1, 5, 144]))}
46 |         print({(x, y.shape) for x, y in inputs.items()})
47 | 
48 |         print("\nTokens")
49 |         print(processor.tokenizer.batch_decode(inputs["input_ids"]))
50 | 
51 |         # generate output; maximum 200 new tokens; stop generation when <|endoftext|> is generated
52 |         output = model.generate_from_batch(inputs, GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"), tokenizer=processor.tokenizer)
53 | 
54 |         # only get generated tokens; decode them to text
55 |         generated_tokens = output[0, inputs["input_ids"].size(1) :]
56 |         generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
57 | 
58 |         # print the generated text
59 |         print(generated_text)
60 | 


--------------------------------------------------------------------------------
/tests/test_renders/output_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/test_renders/output_image.png


--------------------------------------------------------------------------------
/tests/test_renders/output_image_rotated90.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/test_renders/output_image_rotated90.png


--------------------------------------------------------------------------------