├── .dockerignore ├── .github ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── documentation.yml │ └── feature_request.yml ├── actions │ └── setup-venv │ │ └── action.yml ├── dependabot.yml ├── pull_request_template.md └── workflows │ ├── main.yml │ └── pr_checks.yml ├── .gitignore ├── .readthedocs.yaml ├── CHANGELOG.md ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── RELEASE_PROCESS.md ├── docs ├── .gitignore ├── Makefile ├── make.bat └── source │ ├── CHANGELOG.md │ ├── CONTRIBUTING.md │ ├── _static │ ├── css │ │ └── custom.css │ └── favicon.ico │ ├── conf.py │ ├── index.md │ ├── installation.md │ ├── ocr_pareto.pdf │ ├── ocr_pareto.png │ └── overview.md ├── gantry-requirements.txt ├── olmocr ├── __init__.py ├── bench │ ├── README.md │ ├── __init__.py │ ├── benchmark.py │ ├── convert.py │ ├── katex │ │ ├── __init__.py │ │ ├── auto-render.min.js │ │ ├── katex.min.css │ │ ├── katex.min.js │ │ └── render.py │ ├── miners │ │ ├── check_headers_footers.py │ │ ├── check_multicolumn.py │ │ ├── check_old_scans_math.py │ │ ├── cleanup_data.py │ │ ├── cleanup_urls.py │ │ ├── delete_rejected.py │ │ ├── download_math.py │ │ ├── mine_diffs.py │ │ ├── mine_headers_footers.py │ │ ├── mine_long_tiny_text.py │ │ ├── mine_math.py │ │ ├── mine_multi_column.py │ │ ├── mine_old_scan_pdf.py │ │ ├── mine_old_scans.py │ │ ├── mine_old_scans_math.py │ │ ├── mine_reading_order.py │ │ ├── mine_tables_gemini.py │ │ ├── mine_tables_gpt.py │ │ └── pick_mediod.py │ ├── prompts.py │ ├── report.py │ ├── review_app.py │ ├── review_app_latex.py │ ├── runners │ │ ├── __init__.py │ │ ├── run_chatgpt.py │ │ ├── run_claude.py │ │ ├── run_docling.py │ │ ├── run_gemini.py │ │ ├── run_gotocr.py │ │ ├── run_marker.py │ │ ├── run_mineru.py │ │ ├── run_mistral.py │ │ ├── run_olmocr_pipeline.py │ │ ├── run_rolmocr.py │ │ ├── run_server.py │ │ └── run_transformers.py │ ├── sample_data │ │ ├── dataset.jsonl │ │ ├── olmocr_pipeline │ │ │ ├── buildingnotes_pg1_repeat1.md │ │ │ ├── discoverworld_crazy_table4_pg1_repeat1.md │ │ │ ├── earnings_pg1_repeat1.md │ │ │ ├── headers_footers │ │ │ │ ├── ff0f0b22c55d8b90dd77d153f48e144fc9db_pg2_pg1_repeat1.md │ │ │ │ ├── ff1fc6a205ad039139ce566851b6b260c929_pg1_pg1_repeat1.md │ │ │ │ ├── ff3d6e051903fe5ca9bc172ece14964c5632_pg1_pg1_repeat1.md │ │ │ │ ├── ff4f7dad78081cff727d19ab51c181d4a661_pg1_pg1_repeat1.md │ │ │ │ ├── ff518b1240a66978f22035528ccb029450b5_pg2_pg1_repeat1.md │ │ │ │ ├── ffaac214730d2b8c2ec842e3618ccb9c4259_pg1_pg1_repeat1.md │ │ │ │ └── fff590bed29a2854ac1f874dad5752ede1aa_pg1_pg1_repeat1.md │ │ │ ├── lincoln_letter_pg1_repeat1.md │ │ │ ├── math_2503_04086_pg1_repeat1.md │ │ │ ├── mathfuncs_colswitch_pg1_repeat1.md │ │ │ ├── mathfuncs_pg1_repeat1.md │ │ │ ├── mattsnotes_pg1_repeat1.md │ │ │ ├── mattsnotes_pg2_repeat1.md │ │ │ ├── mattsnotes_pg3_repeat1.md │ │ │ ├── multi_column_miss_pg1_repeat1.md │ │ │ ├── olmo2-pg4_pg1_repeat1.md │ │ │ ├── openstax_caculus_pg_273_pg1_repeat1.md │ │ │ ├── small_page_size_pg1_repeat1.md │ │ │ └── test-graphical-text_pg1_repeat1.md │ │ └── pdfs │ │ │ ├── buildingnotes.pdf │ │ │ ├── discoverworld_crazy_table4.pdf │ │ │ ├── earnings.pdf │ │ │ ├── headers_footers │ │ │ ├── ff0f0b22c55d8b90dd77d153f48e144fc9db_pg2.pdf │ │ │ ├── ff1fc6a205ad039139ce566851b6b260c929_pg1.pdf │ │ │ ├── ff3d6e051903fe5ca9bc172ece14964c5632_pg1.pdf │ │ │ ├── ff4f7dad78081cff727d19ab51c181d4a661_pg1.pdf │ │ │ ├── ff518b1240a66978f22035528ccb029450b5_pg2.pdf │ │ │ ├── ffaac214730d2b8c2ec842e3618ccb9c4259_pg1.pdf │ │ │ └── 
fff590bed29a2854ac1f874dad5752ede1aa_pg1.pdf │ │ │ ├── lincoln_letter.pdf │ │ │ ├── math_2503_04086.pdf │ │ │ ├── mathfuncs.pdf │ │ │ ├── mathfuncs_colswitch.pdf │ │ │ ├── mattsnotes.pdf │ │ │ ├── multi_column_miss.pdf │ │ │ ├── olmo2-pg4.pdf │ │ │ ├── openstax_caculus_pg_273.pdf │ │ │ ├── small_page_size.pdf │ │ │ └── test-graphical-text.pdf │ ├── scripts │ │ ├── convert_all.sh │ │ ├── difference_viewer.py │ │ ├── run_difference.py │ │ ├── url_matcher.py │ │ └── workspace_to_bench.py │ ├── synth │ │ ├── __init__.py │ │ ├── mine_html_templates.py │ │ └── test_mine.py │ ├── templates │ │ ├── all_done.html │ │ ├── all_done_latex.html │ │ ├── review.html │ │ └── review_latex.html │ ├── tests.py │ └── utils.py ├── check.py ├── data │ ├── __init__.py │ ├── buildsilver.py │ ├── buildsilverdatasummary.py │ ├── buildtestset.py │ ├── convertsilver_birr.py │ ├── convertsilver_openai.py │ ├── renderpdf.py │ └── runopenaibatch.py ├── datatypes.py ├── eval │ ├── __init__.py │ ├── buildelo.py │ ├── dolma_refine │ │ ├── aligners.py │ │ ├── metrics.py │ │ ├── registry.py │ │ └── segmenters.py │ ├── evalhtml.py │ ├── evalhtml_template.html │ ├── runeval.py │ └── scoreelo.py ├── filter │ ├── __init__.py │ ├── coherency.py │ └── filter.py ├── image_utils.py ├── loadertest.py ├── metrics.py ├── pipeline.py ├── prompts │ ├── __init__.py │ ├── anchor.py │ └── prompts.py ├── py.typed ├── repeatdetect.py ├── s3_utils.py ├── train │ ├── __init__.py │ ├── config │ │ ├── molmo-o-lora-8192.yaml │ │ ├── molmo-o-lora.yaml │ │ ├── qwen25vl-7b.yaml │ │ ├── qwen2vl-2b-lora.yaml │ │ ├── qwen2vl-2b.yaml │ │ ├── qwen2vl-7b-lora.yaml │ │ └── qwen2vl-7b.yaml │ ├── core │ │ ├── __init__.py │ │ ├── adapters.py │ │ ├── cli.py │ │ ├── compression.py │ │ ├── config.py │ │ ├── errors.py │ │ ├── loggers.py │ │ ├── paths.py │ │ └── state.py │ ├── dataloader.py │ ├── dataprep.py │ ├── fixqwen25vlcheckpoint.py │ ├── hf │ │ ├── __init__.py │ │ ├── convertjsontoparquet.py │ │ ├── hfhub_upload.py │ │ └── warc_parser.py │ ├── inference.py │ ├── loaddataset.py │ ├── molmo │ │ ├── __init__.py │ │ ├── config_molmo.py │ │ ├── image_processing_molmo.py │ │ ├── modeling_molmo.py │ │ └── preprocessing_molmo.py │ ├── train.py │ └── utils.py ├── version.py ├── viewer │ ├── __init__.py │ ├── dolmaviewer.py │ └── dolmaviewer_template.html └── work_queue.py ├── pyproject.toml ├── scripts ├── autoscan_dolmadocs.py ├── beaker │ ├── Dockerfile-gpu-ci │ ├── Dockerfile-inference │ ├── Dockerfile-tagging │ ├── Dockerfile-train │ ├── gpu-ci-script.sh │ ├── jupiter-ib.sh │ └── pluto-ib.sh ├── benchmark_throughput.py ├── birr │ └── config │ │ └── qwen2-vl-7b-pdf-weka.yaml ├── build-docker.sh ├── chatgpt_tag_dolmadocs_v1.py ├── chatgpt_tag_dolmadocs_v2.py ├── check_qual.sh ├── elo │ ├── README.md │ ├── boxplots.png │ ├── calculate_elo_ratings.py │ ├── draw_boxplots.py │ ├── ratings.csv │ └── results.txt ├── infinigram_count.py ├── jsonl_to_markdown.py ├── molmo-7b-lora-gantry.sh ├── movedolmadocs_to_md.py ├── pareto_plot.py ├── parse_with_pdfminer.py ├── pii_rule_comparison.py ├── prepare_changelog.py ├── qwen25vl-7b-gantry.sh ├── qwen2vl-2b-gantry.sh ├── qwen2vl-7b-gantry.sh ├── qwen2vl-7b-lora-gantry.sh ├── release.sh ├── release_notes.py ├── rich_tagging_pipeline.py ├── run_benchmark.sh ├── run_integration_test.sh ├── run_tagging_pipeline.sh ├── s2orc_extractor.sh ├── scan_dolmadocs.py ├── tagging_pipeline.py └── tagging_pipeline_v2.py └── tests ├── __init__.py ├── gnarly_pdfs ├── ambiguous.pdf ├── badlines.pdf ├── bws_book_ch2.pdf ├── 
discoverworld_crazy_tables.pdf ├── dolma-page-1.pdf ├── edgar.pdf ├── failing_anchor_pg4.pdf ├── failing_pdf_pg9.pdf ├── form_on_later_pages.pdf ├── guidebook_failed_pages.pdf ├── handwriting_bad_ocr.pdf ├── horribleocr.pdf ├── instructions_and_schematics.pdf ├── large_prompt_hint1.pdf ├── large_prompt_hint2.pdf ├── large_prompt_hint3.pdf ├── load_v_error.pdf ├── lots_of_chem_tables.pdf ├── lots_of_sci_tables.pdf ├── map1.pdf ├── most_content_in_image_form.pdf ├── newspaper.pdf ├── not_parsing.pdf ├── not_parsing2.pdf ├── olmo-page-1.pdf ├── overrun_on_pg8.pdf ├── pdftotext_two_column_issue.pdf ├── repeating_references_on_pg9_pg10.pdf ├── skinnypage.pdf ├── slideshow_mostly_good_some_pages_should_get_filtered.pdf ├── slideshow_mostly_images.pdf ├── small_page_size.pdf ├── some_ocr1.pdf ├── ti89_guidebook_programming.pdf └── tobacco_missed_tokens_pg1.pdf ├── test_anchor.py ├── test_dataloader.py ├── test_dataprep.py ├── test_filter.py ├── test_integration.py ├── test_molmo.py ├── test_renders ├── output_image.png └── output_image_rotated90.png ├── test_s3_work_queue.py ├── test_sglang.py └── test_tests.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .github 3 | .mypy_cache 4 | .pytest_cache 5 | .venv 6 | __pycache__ 7 | *.egg-info 8 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: 🐛 Bug Report 2 | description: Create a report to help us reproduce and fix the bug 3 | labels: 'bug' 4 | 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: > 9 | #### Before submitting a bug, please make sure the issue hasn't already been addressed by searching through [the existing and past issues](https://github.com/allenai/olmocr/issues?q=is%3Aissue+sort%3Acreated-desc+). 10 | - type: textarea 11 | attributes: 12 | label: 🐛 Describe the bug 13 | description: | 14 | Please provide a clear and concise description of what the bug is. 15 | 16 | If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example: 17 | 18 | ```python 19 | # All necessary imports at the beginning 20 | import olmocr 21 | 22 | # A succinct reproducing example trimmed down to the essential parts: 23 | assert False is True, "Oh no!" 24 | ``` 25 | 26 | If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com. 27 | 28 | Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple backtick blocks``` ````. 29 | placeholder: | 30 | A clear and concise description of what the bug is. 31 | validations: 32 | required: true 33 | - type: textarea 34 | attributes: 35 | label: Versions 36 | description: | 37 | Please run the following and paste the output below. 
38 | ```sh 39 | python --version && pip freeze 40 | ``` 41 | validations: 42 | required: true 43 | - type: markdown 44 | attributes: 45 | value: > 46 | Thanks for contributing 🎉! 47 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to https://olmocr.readthedocs.io/latest 3 | labels: 'documentation' 4 | 5 | body: 6 | - type: textarea 7 | attributes: 8 | label: 📚 The doc issue 9 | description: > 10 | A clear and concise description of what content in https://olmocr.readthedocs.io/latest is an issue. 11 | validations: 12 | required: true 13 | - type: textarea 14 | attributes: 15 | label: Suggest a potential alternative/fix 16 | description: > 17 | Tell us how we could improve the documentation in this regard. 18 | - type: markdown 19 | attributes: 20 | value: > 21 | Thanks for contributing 🎉! 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Submit a proposal/request for a new feature 3 | labels: 'feature request' 4 | 5 | body: 6 | - type: textarea 7 | attributes: 8 | label: 🚀 The feature, motivation and pitch 9 | description: > 10 | A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. 11 | validations: 12 | required: true 13 | - type: textarea 14 | attributes: 15 | label: Alternatives 16 | description: > 17 | A description of any alternative solutions or features you've considered, if any. 18 | - type: textarea 19 | attributes: 20 | label: Additional context 21 | description: > 22 | Add any other context or screenshots about the feature request. 23 | - type: markdown 24 | attributes: 25 | value: > 26 | Thanks for contributing 🎉! 27 | -------------------------------------------------------------------------------- /.github/actions/setup-venv/action.yml: -------------------------------------------------------------------------------- 1 | name: Python virtualenv 2 | description: Set up a Python virtual environment with caching 3 | inputs: 4 | python-version: 5 | description: The Python version to use 6 | required: true 7 | cache-prefix: 8 | description: Update this to invalidate the cache 9 | required: true 10 | default: v0 11 | runs: 12 | using: composite 13 | steps: 14 | - name: Setup Python 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: ${{ inputs.python-version }} 18 | 19 | - shell: bash 20 | run: | 21 | # Install prerequisites. 22 | pip install --upgrade pip setuptools wheel virtualenv 23 | 24 | - shell: bash 25 | run: | 26 | # Get the exact Python version to use in the cache key. 27 | echo "PYTHON_VERSION=$(python --version)" >> $GITHUB_ENV 28 | 29 | - uses: actions/cache@v3 30 | id: virtualenv-cache 31 | with: 32 | path: .venv 33 | key: ${{ inputs.cache-prefix }}-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('pyproject.toml') }} 34 | 35 | - if: steps.virtualenv-cache.outputs.cache-hit != 'true' 36 | shell: bash 37 | run: | 38 | # Set up virtual environment without cache hit. 
39 | test -d .venv || virtualenv -p $(which python) --copies --reset-app-data .venv 40 | . .venv/bin/activate 41 | pip install -e .[dev] 42 | pip install -e .[bench] 43 | 44 | - if: steps.virtualenv-cache.outputs.cache-hit == 'true' 45 | shell: bash 46 | run: | 47 | # Set up virtual environment from cache hit. 48 | . .venv/bin/activate 49 | pip install --no-deps -e .[dev] 50 | pip install --no-deps -e .[bench] 51 | 52 | - shell: bash 53 | run: | 54 | # Show environment info. 55 | . .venv/bin/activate 56 | echo "✓ Installed $(python --version) virtual environment to $(which python)" 57 | echo "Packages:" 58 | pip freeze 59 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | open-pull-requests-limit: 10 8 | - package-ecosystem: "github-actions" 9 | directory: "/" 10 | schedule: 11 | interval: "daily" 12 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Fixes # 5 | 6 | Changes proposed in this pull request: 7 | 8 | - 9 | 10 | ## Before submitting 11 | 12 | 13 | - [ ] I've read and followed all steps in the [Making a pull request](https://github.com/allenai/olmocr/blob/main/.github/CONTRIBUTING.md#making-a-pull-request) 14 | section of the `CONTRIBUTING` docs. 15 | - [ ] I've updated or added any relevant docstrings following the syntax described in the 16 | [Writing docstrings](https://github.com/allenai/olmocr/blob/main/.github/CONTRIBUTING.md#writing-docstrings) section of the `CONTRIBUTING` docs. 17 | - [ ] If this PR fixes a bug, I've added a test that will fail without my fix. 18 | - [ ] If this PR adds a new feature, I've added tests that sufficiently cover my new functionality. 19 | -------------------------------------------------------------------------------- /.github/workflows/pr_checks.yml: -------------------------------------------------------------------------------- 1 | name: PR Checks 2 | 3 | concurrency: 4 | group: ${{ github.workflow }}-${{ github.ref }} 5 | cancel-in-progress: true 6 | 7 | on: 8 | pull_request: 9 | branches: 10 | - main 11 | paths: 12 | - 'olmocr/**' 13 | 14 | jobs: 15 | changelog: 16 | name: CHANGELOG 17 | runs-on: ubuntu-latest 18 | if: github.event_name == 'pull_request' 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Check that CHANGELOG has been updated 26 | run: | 27 | # If this step fails, this means you haven't updated the CHANGELOG.md 28 | # file with notes on your contribution. 29 | git diff --name-only $(git merge-base origin/main HEAD) | grep '^CHANGELOG.md$' && echo "Thanks for helping keep our CHANGELOG up-to-date!" 
30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ml stuff 2 | wandb/ 3 | *histogram.png 4 | *.json 5 | dolma_previews/* 6 | s2_previews/* 7 | gnarly_previews/* 8 | s2orc_previews/* 9 | s2orc_previews_3200/* 10 | sample200_vllm/* 11 | sample200_sglang/* 12 | pdelfin_testset/* 13 | localworkspace/* 14 | math_data/* 15 | math_data_big/* 16 | gpt4otestset/* 17 | gpt4otestset_output/* 18 | pdfs/* 19 | olmOCR-bench/* 20 | table_data*/ 21 | /synth*/ 22 | dolma_samples/* 23 | /*.html 24 | scoreelo.csv 25 | debug.log 26 | birrpipeline-debug.log 27 | beakerpipeline-debug.log 28 | olmocr-pipeline-debug.log 29 | 30 | # build artifacts 31 | 32 | .eggs/ 33 | .mypy_cache 34 | *.egg-info/ 35 | build/ 36 | dist/ 37 | pip-wheel-metadata/ 38 | 39 | 40 | # dev tools 41 | 42 | .envrc 43 | .python-version 44 | .idea 45 | .venv/ 46 | .vscode/ 47 | /*.iml 48 | pyrightconfig.json 49 | 50 | 51 | # jupyter notebooks 52 | 53 | .ipynb_checkpoints 54 | 55 | 56 | # miscellaneous 57 | 58 | .cache/ 59 | doc/_build/ 60 | *.swp 61 | .DS_Store 62 | 63 | 64 | # python 65 | 66 | *.pyc 67 | *.pyo 68 | __pycache__ 69 | 70 | 71 | # testing and continuous integration 72 | 73 | .coverage 74 | .pytest_cache/ 75 | .benchmarks 76 | 77 | # documentation build artifacts 78 | 79 | docs/build 80 | site/ 81 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sphinx: 4 | configuration: docs/source/conf.py 5 | fail_on_warning: true 6 | 7 | python: 8 | version: "3.8" 9 | install: 10 | - method: pip 11 | path: . 12 | extra_requirements: 13 | - dev 14 | 15 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## Unreleased 9 | 10 | ## [v0.1.71](https://github.com/allenai/olmocr/releases/tag/v0.1.71) - 2025-05-30 11 | 12 | ## [v0.1.70](https://github.com/allenai/olmocr/releases/tag/v0.1.70) - 2025-05-23 13 | 14 | ## [v0.1.69](https://github.com/allenai/olmocr/releases/tag/v0.1.69) - 2025-05-20 15 | 16 | ## [v0.1.68](https://github.com/allenai/olmocr/releases/tag/v0.1.68) - 2025-05-19 17 | 18 | ## [v0.1.60](https://github.com/allenai/olmocr/releases/tag/v0.1.60) - 2025-03-17 19 | 20 | ## [v0.1.58](https://github.com/allenai/olmocr/releases/tag/v0.1.58) - 2025-02-15 21 | 22 | ## [v0.1.53](https://github.com/allenai/olmocr/releases/tag/v0.1.53) - 2025-02-14 23 | 24 | - Fixed git checks 25 | 26 | - Added gemini and claude runners and a viewer. 
-------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM --platform=linux/amd64 nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04 2 | 3 | RUN apt-get update -y && apt-get install -y software-properties-common \ 4 | && add-apt-repository ppa:deadsnakes/ppa \ 5 | && apt-get -y update 6 | 7 | RUN apt-get update && apt-get -y install python3-apt 8 | RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections 9 | RUN apt-get update -y && apt-get install -y poppler-utils ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools 10 | 11 | RUN apt-get update -y && apt-get install -y --no-install-recommends \ 12 | git \ 13 | git-lfs \ 14 | python3.11 \ 15 | python3.11-dev \ 16 | python3.11-distutils \ 17 | ca-certificates \ 18 | build-essential \ 19 | curl \ 20 | wget \ 21 | unzip 22 | 23 | RUN rm -rf /var/lib/apt/lists/* \ 24 | && unlink /usr/bin/python3 \ 25 | && ln -s /usr/bin/python3.11 /usr/bin/python3 \ 26 | && ln -s /usr/bin/python3 /usr/bin/python \ 27 | && curl -sS https://bootstrap.pypa.io/get-pip.py | python \ 28 | && pip3 install -U pip 29 | 30 | RUN apt-get update && apt-get -y install python3.11-venv 31 | ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh 32 | RUN /install.sh && rm /install.sh 33 | 34 | ENV PYTHONUNBUFFERED=1 35 | 36 | WORKDIR /root 37 | COPY pyproject.toml pyproject.toml 38 | COPY olmocr/version.py olmocr/version.py 39 | 40 | RUN /root/.local/bin/uv pip install --system --no-cache -e . 41 | RUN /root/.local/bin/uv pip install --system --no-cache ".[gpu]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/ 42 | RUN /root/.local/bin/uv pip install --system --no-cache ".[bench]" 43 | RUN playwright install-deps 44 | RUN playwright install chromium 45 | COPY olmocr olmocr 46 | COPY scripts scripts 47 | 48 | RUN python3 -m sglang.launch_server --help 49 | RUN python3 -m olmocr.pipeline --help -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY : docs 2 | docs : 3 | rm -rf docs/build/ 4 | sphinx-autobuild -b html --watch olmocr/ docs/source/ docs/build/ 5 | 6 | .PHONY : run-checks 7 | run-checks : 8 | isort --check . 9 | black --check . 10 | ruff check . 11 | mypy . 12 | CUDA_VISIBLE_DEVICES='' pytest -v --color=yes --doctest-modules tests/ olmocr/ 13 | 14 | .PHONY : build 15 | build : 16 | rm -rf *.egg-info/ 17 | python -m build -------------------------------------------------------------------------------- /RELEASE_PROCESS.md: -------------------------------------------------------------------------------- 1 | # GitHub Release Process 2 | 3 | ## Steps 4 | 5 | 1. Update the version in `olmocr/version.py`. 6 | 7 | 2. Run the release script: 8 | 9 | ```bash 10 | ./scripts/release.sh 11 | ``` 12 | 13 | This will commit the changes to the CHANGELOG and `version.py` files and then create a new tag in git 14 | which will trigger a workflow on GitHub Actions that handles the rest. 15 | 16 | ## Fixing a failed release 17 | 18 | If for some reason the GitHub Actions release workflow failed with an error that needs to be fixed, you'll have to delete both the tag and corresponding release from GitHub.
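19 | 20 | For example, if the failed tag were `v0.1.71` (a hypothetical version number; substitute the actual failed tag), you could delete the remote tag and its release using the GitHub CLI: 21 | 22 | ```bash 23 | git push --delete origin v0.1.71 24 | gh release delete v0.1.71 25 | ``` 26 | 27 | You can also delete the release from the repository's "Releases" page on GitHub instead of using `gh`.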
28 | After you've pushed a fix, delete the tag from your local clone with 29 | 30 | ```bash 31 | git tag -l | xargs git tag -d && git fetch -t 32 | ``` 33 | 34 | Then repeat the steps above. 35 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= -W 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.https://www.sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ../../CHANGELOG.md -------------------------------------------------------------------------------- /docs/source/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ../../.github/CONTRIBUTING.md -------------------------------------------------------------------------------- /docs/source/_static/css/custom.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/docs/source/_static/css/custom.css -------------------------------------------------------------------------------- /docs/source/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/docs/source/_static/favicon.ico -------------------------------------------------------------------------------- /docs/source/index.md: -------------------------------------------------------------------------------- 1 | # **olmocr** 2 | 3 | ```{toctree} 4 | :maxdepth: 2 5 | :hidden: 6 | :caption: Getting started 7 | 8 | installation 9 | overview 10 | ``` 11 | 12 | ```{toctree} 13 | :hidden: 14 | :caption: Development 15 | 16 | CHANGELOG 17 | CONTRIBUTING 18 | License 19 | GitHub Repository 20 | ``` 21 | 22 | ## Indices and tables 23 | 24 | ```{eval-rst} 25 | * :ref:`genindex` 26 | * :ref:`modindex` 27 | ``` 28 | -------------------------------------------------------------------------------- /docs/source/installation.md: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | **olmocr** supports Python >= 3.8. 5 | 6 | ## Installing with `pip` 7 | 8 | **olmocr** is available [on PyPI](https://pypi.org/project/olmocr/). Just run 9 | 10 | ```bash 11 | pip install olmocr 12 | ``` 13 | 14 | ## Installing from source 15 | 16 | To install **olmocr** from source, first clone [the repository](https://github.com/allenai/olmocr): 17 | 18 | ```bash 19 | git clone https://github.com/allenai/olmocr.git 20 | cd olmocr 21 | ``` 22 | 23 | Then run 24 | 25 | ```bash 26 | pip install -e . 
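27 | # Optionally, install the extra dependency groups as well (these extra names 28 | # are taken from .github/actions/setup-venv/action.yml in this repo): 29 | pip install -e .[dev] 30 | pip install -e .[bench]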
31 | ``` -------------------------------------------------------------------------------- /docs/source/ocr_pareto.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/docs/source/ocr_pareto.pdf -------------------------------------------------------------------------------- /docs/source/ocr_pareto.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/docs/source/ocr_pareto.png -------------------------------------------------------------------------------- /docs/source/overview.md: -------------------------------------------------------------------------------- 1 | Overview 2 | ======== 3 | 4 | -------------------------------------------------------------------------------- /gantry-requirements.txt: -------------------------------------------------------------------------------- 1 | torchvision 2 | cached-path 3 | smart_open 4 | pypdf 5 | pypdfium2 6 | lingua-language-detector 7 | Pillow 8 | ruff 9 | mypy>=1.0,<1.5 10 | black>=23.0,<24.0 11 | isort>=5.12,<5.13 12 | pytest 13 | pytest-sphinx 14 | pytest-cov 15 | twine>=1.11.0 16 | build 17 | setuptools 18 | wheel 19 | Sphinx>=4.3.0,<7.1.0 20 | furo==2023.7.26 21 | myst-parser>=1.0,<2.1 22 | sphinx-copybutton==0.5.2 23 | sphinx-autobuild==2021.3.14 24 | sphinx-autodoc-typehints==1.23.3 25 | packaging 26 | necessary 27 | accelerate>=0.34.2 28 | datasets==3.0.0 29 | peft 30 | wandb 31 | omegaconf 32 | s3fs 33 | transformers>=4.45.1 34 | bitsandbytes 35 | ftfy 36 | -------------------------------------------------------------------------------- /olmocr/__init__.py: -------------------------------------------------------------------------------- 1 | from .version import VERSION, VERSION_SHORT 2 | -------------------------------------------------------------------------------- /olmocr/bench/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/__init__.py -------------------------------------------------------------------------------- /olmocr/bench/katex/__init__.py: -------------------------------------------------------------------------------- 1 | from .render import compare_rendered_equations, render_equation 2 | -------------------------------------------------------------------------------- /olmocr/bench/katex/auto-render.min.js: -------------------------------------------------------------------------------- 1 | !function(e,t){"object"==typeof exports&&"object"==typeof module?module.exports=t(require("katex")):"function"==typeof define&&define.amd?define(["katex"],t):"object"==typeof exports?exports.renderMathInElement=t(require("katex")):e.renderMathInElement=t(e.katex)}("undefined"!=typeof self?self:this,(function(e){return function(){"use strict";var t={757:function(t){t.exports=e}},n={};function r(e){var o=n[e];if(void 0!==o)return o.exports;var i=n[e]={exports:{}};return t[e](i,i.exports,r),i.exports}r.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return r.d(t,{a:t}),t},r.d=function(e,t){for(var n in t)r.o(t,n)&&!r.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},r.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)};var o={};r.d(o,{default:function(){return p}});var 
i=r(757),a=r.n(i);const l=function(e,t,n){let r=n,o=0;const i=e.length;for(;re.left.replace(/[-/\\^$*+?.()|[\]{}]/g,"\\$&"))).join("|")+")");for(;n=e.search(o),-1!==n;){n>0&&(r.push({type:"text",data:e.slice(0,n)}),e=e.slice(n));const o=t.findIndex((t=>e.startsWith(t.left)));if(n=l(t[o].right,e,t[o].left.length),-1===n)break;const i=e.slice(0,n+t[o].right.length),a=s.test(i)?i:e.slice(t[o].left.length,n);r.push({type:"math",data:a,rawData:i,display:t[o].display}),e=e.slice(n+t[o].right.length)}return""!==e&&r.push({type:"text",data:e}),r};const c=function(e,t){const n=d(e,t.delimiters);if(1===n.length&&"text"===n[0].type)return null;const r=document.createDocumentFragment();for(let e=0;e-1===e.indexOf(" "+t+" ")))&&f(r,t)}}};var p=function(e,t){if(!e)throw new Error("No element provided to render");const n={};for(const e in t)t.hasOwnProperty(e)&&(n[e]=t[e]);n.delimiters=n.delimiters||[{left:"$$",right:"$$",display:!0},{left:"\\(",right:"\\)",display:!1},{left:"\\begin{equation}",right:"\\end{equation}",display:!0},{left:"\\begin{align}",right:"\\end{align}",display:!0},{left:"\\begin{alignat}",right:"\\end{alignat}",display:!0},{left:"\\begin{gather}",right:"\\end{gather}",display:!0},{left:"\\begin{CD}",right:"\\end{CD}",display:!0},{left:"\\[",right:"\\]",display:!0}],n.ignoredTags=n.ignoredTags||["script","noscript","style","textarea","pre","code","option"],n.ignoredClasses=n.ignoredClasses||[],n.errorCallback=n.errorCallback||console.error,n.macros=n.macros||{},f(e,n)};return o=o.default}()})); -------------------------------------------------------------------------------- /olmocr/bench/miners/cleanup_urls.py: -------------------------------------------------------------------------------- 1 | # Rewrites all URLs in a dataset.jsonl file using a SQLite database lookup 2 | import argparse 3 | import json 4 | import re 5 | import sqlite3 6 | from typing import Optional 7 | 8 | 9 | def parse_pdf_hash(pretty_pdf_path: str) -> Optional[str]: 10 | pattern = r"s3://ai2-s2-pdfs/([a-f0-9]{4})/([a-f0-9]+)\.pdf" 11 | match = re.match(pattern, pretty_pdf_path) 12 | if match: 13 | return match.group(1) + match.group(2) 14 | return None 15 | 16 | 17 | def get_uri_from_db(db_path: str, pdf_hash: str) -> Optional[str]: 18 | conn = sqlite3.connect(db_path) 19 | cursor = conn.cursor() 20 | cursor.execute("SELECT uri FROM pdf_mapping WHERE pdf_hash = ?", (pdf_hash,)) 21 | result = cursor.fetchone() 22 | conn.close() 23 | return result[0] if result else None 24 | 25 | 26 | if __name__ == "__main__": 27 | parser = argparse.ArgumentParser(description="Rewrites all URLs in a dataset.jsonl file using a SQLite database lookup") 28 | parser.add_argument("jsonl", type=str, help="JSONL file containing s3 paths") 29 | parser.add_argument("--db", type=str, required=True, help="Path to sqlite database mapping internal s3 urls to external ones") 30 | parser.add_argument("--force", action="store_true", help="Actually write the rewritten rows back to the JSONL file (without this flag the script only does a dry run)") 31 | args = parser.parse_args() 32 | 33 | data = [] 34 | skipped = 0 35 | 36 | with open(args.jsonl, "r") as inpf: 37 | for row in inpf: 38 | if len(row.strip()) > 0: 39 | j = json.loads(row) 40 | 41 | assert j["url"] 42 | hash = parse_pdf_hash(j["url"]) 43 | if hash: 44 | url = get_uri_from_db(args.db, hash) 45 | 46 | if url: 47 | j["url"] = url 48 | data.append(j) 49 | else: 50 | skipped += 1 51 | else: 52 | data.append(j) 53 | 54 | print(data) 55 | 56 | print(f"{skipped} entries were skipped!") 57 | 58 | if not args.force: 59 | print("Now 
run with --force to write data") 60 | quit() 61 | 62 | with open(args.jsonl, "w") as inpf: 63 | for row in data: 64 | print(json.dumps(row), file=inpf) 65 | -------------------------------------------------------------------------------- /olmocr/bench/prompts.py: -------------------------------------------------------------------------------- 1 | def build_basic_prompt() -> str: 2 | return r"Please provide a natural, plain text representation of the document, formatted in Markdown. Skip any headers and footers. For ALL mathematical expressions, use LaTeX notation with \( and \) for inline equations and \[ and \] for display equations. Convert any tables into Markdown format." 3 | 4 | 5 | def build_openai_silver_data_prompt_no_document_anchoring(_base_text: str) -> str: 6 | return ( 7 | "Below is the image of one page of a PDF document. " 8 | "Just return the plain text representation of this document as if you were reading it naturally.\n" 9 | "Turn equations into a LaTeX representation, and tables into markdown format. Remove the headers and footers, but keep references and footnotes.\n" 10 | "Read any natural handwriting.\n" 11 | "This is likely one page out of several in the document, so be sure to preserve any sentences that come from the previous page, or continue onto the next page, exactly as they are.\n" 12 | "If there is no text at all that you think you should read, you can output null.\n" 13 | "Do not hallucinate." 14 | ) 15 | 16 | 17 | def claude_response_format_schema() -> tuple: 18 | return ( 19 | { 20 | "name": "page_response", 21 | "description": "Extracts text from PDFs.", 22 | "input_schema": { 23 | "type": "object", 24 | "properties": { 25 | "primary_language": { 26 | "type": ["string", "null"], 27 | "description": "The primary language of the text using two-letter codes or null if there is no text at all that you think you should read.", 28 | }, 29 | "is_rotation_valid": { 30 | "type": "boolean", 31 | "description": "Is this page oriented correctly for reading? 
Answer only considering the textual content, do not factor in the rotation of any charts, tables, drawings, or figures.", 32 | }, 33 | "rotation_correction": { 34 | "type": "integer", 35 | "description": "Indicates the degree of clockwise rotation needed if the page is not oriented correctly.", 36 | "enum": [0, 90, 180, 270], 37 | "default": 0, 38 | }, 39 | "is_table": { 40 | "type": "boolean", 41 | "description": "Indicates if the majority of the page content is in tabular format.", 42 | }, 43 | "is_diagram": { 44 | "type": "boolean", 45 | "description": "Indicates if the majority of the page content is a visual diagram.", 46 | }, 47 | "natural_text": { 48 | "type": ["string", "null"], 49 | "description": "The natural text content extracted from the page.", 50 | }, 51 | }, 52 | "required": [ 53 | "primary_language", 54 | "is_rotation_valid", 55 | "rotation_correction", 56 | "is_table", 57 | "is_diagram", 58 | "natural_text", 59 | ], 60 | }, 61 | }, 62 | ) 63 | -------------------------------------------------------------------------------- /olmocr/bench/runners/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/runners/__init__.py -------------------------------------------------------------------------------- /olmocr/bench/runners/run_chatgpt.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import Literal 4 | 5 | from openai import OpenAI 6 | 7 | from olmocr.bench.prompts import ( 8 | build_basic_prompt, 9 | build_openai_silver_data_prompt_no_document_anchoring, 10 | ) 11 | from olmocr.data.renderpdf import render_pdf_to_base64png 12 | from olmocr.prompts.anchor import get_anchor_text 13 | from olmocr.prompts.prompts import ( 14 | PageResponse, 15 | build_finetuning_prompt, 16 | build_openai_silver_data_prompt, 17 | openai_response_format_schema, 18 | ) 19 | 20 | 21 | def run_chatgpt( 22 | pdf_path: str, 23 | page_num: int = 1, 24 | model: str = "gpt-4o-2024-08-06", 25 | temperature: float = 0.1, 26 | target_longest_image_dim: int = 2048, 27 | prompt_template: Literal["full", "full_no_document_anchoring", "basic", "finetune"] = "finetune", 28 | response_template: Literal["plain", "json"] = "json", 29 | ) -> str: 30 | """ 31 | Convert a page of a PDF file to markdown using the commercial OpenAI APIs. 32 | 33 | See run_server.py for running against an OpenAI-compatible server. 34 | 35 | Args: 36 | pdf_path (str): The local path to the PDF file. 37 | 38 | Returns: 39 | str: The OCR result in markdown format. 40 | """ 41 | # Render the requested page of the PDF to a base64-encoded PNG image. 
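42 | # Note: the anchor text extracted below comes from the PDF's own text layer; 43 | # the "full" prompt template passes it to the model for document anchoring, while 44 | # "full_no_document_anchoring" (in olmocr/bench/prompts.py) takes the same argument but ignores it.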
45 | image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim) 46 | anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport") 47 | 48 | if not os.getenv("OPENAI_API_KEY"): 49 | raise SystemExit("You must specify an OPENAI_API_KEY") 50 | 51 | client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) 52 | 53 | if prompt_template == "full": 54 | prompt = build_openai_silver_data_prompt(anchor_text) 55 | elif prompt_template == "full_no_document_anchoring": 56 | prompt = build_openai_silver_data_prompt_no_document_anchoring(anchor_text) 57 | elif prompt_template == "finetune": 58 | prompt = build_finetuning_prompt(anchor_text) 59 | elif prompt_template == "basic": 60 | prompt = build_basic_prompt() 61 | else: 62 | raise ValueError("Unknown prompt template") 63 | 64 | response = client.chat.completions.create( 65 | model=model, 66 | messages=[ 67 | { 68 | "role": "user", 69 | "content": [ 70 | {"type": "text", "text": prompt}, 71 | {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}, 72 | ], 73 | } 74 | ], 75 | temperature=temperature, 76 | max_tokens=3000, 77 | response_format=openai_response_format_schema() if response_template == "json" else None, 78 | ) 79 | 80 | assert len(response.choices) > 0 81 | assert response.choices[0].message.refusal is None 82 | assert response.choices[0].finish_reason == "stop" 83 | 84 | raw_response = response.choices[0].message.content 85 | 86 | if response_template == "json": 87 | data = json.loads(raw_response) 88 | data = PageResponse(**data) 89 | 90 | return data.natural_text 91 | else: 92 | return raw_response 93 | -------------------------------------------------------------------------------- /olmocr/bench/runners/run_claude.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from anthropic import Anthropic 5 | from prompts import build_openai_silver_data_prompt, claude_response_format_schema 6 | 7 | from olmocr.data.renderpdf import render_pdf_to_base64png 8 | from olmocr.prompts.anchor import get_anchor_text 9 | 10 | 11 | def run_claude(pdf_path: str, page_num: int = 1, model: str = "claude-3-7-sonnet-20250219", temperature: float = 0.1) -> str: 12 | """ 13 | Convert a page of a PDF file to markdown using Claude OCR. 14 | This function renders the specified page of the PDF to an image, runs OCR on that image, 15 | and returns the OCR result as a markdown-formatted string. 16 | 17 | Args: 18 | pdf_path (str): The local path to the PDF file. 19 | page_num (int): The page number to process (starting from 1). 20 | model (str): The Claude model to use. 21 | temperature (float): The temperature parameter for generation. 22 | 23 | Returns: 24 | str: The OCR result in markdown format. 
25 | """ 26 | 27 | if not os.getenv("ANTHROPIC_API_KEY"): 28 | raise SystemExit("You must specify an ANTHROPIC_API_KEY") 29 | 30 | image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048) 31 | anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport") 32 | client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) 33 | response = client.messages.create( 34 | model=model, 35 | max_tokens=3000, 36 | temperature=temperature, 37 | # system=system_prompt, 38 | tools=claude_response_format_schema(), 39 | messages=[ 40 | { 41 | "role": "user", 42 | "content": [ 43 | {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}}, 44 | { 45 | "type": "text", 46 | "text": f"{build_openai_silver_data_prompt(anchor_text)}. Use the page_response tool to respond. If the propeties are true, then extract the text from them and respond in natural_text.", 47 | }, 48 | ], 49 | } 50 | ], 51 | ) 52 | 53 | json_sentiment = None 54 | for content in response.content: 55 | if content.type == "tool_use" and content.name == "page_response": 56 | json_sentiment = content.input 57 | break 58 | 59 | if json_sentiment: 60 | response = json.dumps(json_sentiment, indent=2) 61 | return response 62 | -------------------------------------------------------------------------------- /olmocr/bench/runners/run_docling.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import tempfile 4 | from typing import Literal 5 | 6 | from pypdf import PdfReader, PdfWriter 7 | 8 | 9 | async def run_docling( 10 | pdf_path: str, 11 | page_num: int = 1, 12 | output_format: Literal["markdown"] = "markdown", 13 | use_smoldocling: bool = False, 14 | ) -> str: 15 | """Run docling CLI on a PDF file and return the results. 
19 | 20 | Args: 21 | pdf_path: Path to the PDF file 22 | page_num: Page number to process (1-indexed) 23 | output_format: Output format (only markdown is supported for CLI version) 24 | use_smoldocling: If True, run docling's SmolDocling VLM pipeline instead of the default pipeline 25 | 26 | Returns: 27 | String containing the markdown output 28 | """ 29 | if output_format != "markdown": 30 | raise ValueError("Only markdown output format is supported for CLI version") 31 | 32 | # Extract the specific page using pypdf 33 | pdf_reader = PdfReader(pdf_path) 34 | pdf_writer = PdfWriter() 35 | 36 | # Convert from 1-indexed to 0-indexed 37 | zero_based_page_num = page_num - 1 38 | 39 | if zero_based_page_num >= len(pdf_reader.pages) or zero_based_page_num < 0: 40 | raise ValueError(f"Page number {page_num} is out of bounds for PDF with {len(pdf_reader.pages)} pages") 41 | 42 | # Add the selected page to the writer 43 | pdf_writer.add_page(pdf_reader.pages[zero_based_page_num]) 44 | 45 | # Create temporary files for the single-page PDF and output markdown 46 | with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf_file, tempfile.NamedTemporaryFile(suffix=".md", delete=False) as tmp_md_file: 47 | tmp_pdf_path = tmp_pdf_file.name 48 | tmp_md_path = tmp_md_file.name 49 | 50 | try: 51 | # Write the single-page PDF to the temporary file 52 | with open(tmp_pdf_path, "wb") as f: 53 | pdf_writer.write(f) 54 | 55 | # Build the command to run docling on the single-page PDF 56 | if use_smoldocling: 57 | cmd = ["docling", "--pipeline", "vlm", "--vlm-model", "smoldocling", tmp_pdf_path, "-o", tmp_md_path]  # Output file 58 | else: 59 | cmd = ["docling", tmp_pdf_path, "-o", tmp_md_path]  # Output file 60 | 61 | # Run the command asynchronously 62 | proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE) 63 | 64 | stdout, stderr = await proc.communicate() 65 | 66 | if proc.returncode != 0: 67 | error_msg = stderr.decode() if stderr else "Unknown error" 68 | raise RuntimeError(f"docling command failed with return code {proc.returncode}: {error_msg}") 69 | 70 | # Read the results from the temporary markdown file 71 | with open(tmp_md_path, "r", encoding="utf-8") as f: 72 | result = f.read() 73 | 74 | return result 75 | 76 | finally: 77 | # Clean up the temporary files 78 | for path in [tmp_pdf_path, tmp_md_path]: 79 | if os.path.exists(path): 80 | os.unlink(path) 81 | -------------------------------------------------------------------------------- /olmocr/bench/runners/run_gotocr.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import os 3 | import tempfile 4 | 5 | import torch 6 | from transformers import AutoModel, AutoTokenizer 7 | 8 | from olmocr.data.renderpdf import render_pdf_to_base64png 9 | 10 | # Global cache for the model and tokenizer. 11 | _device = "cuda" if torch.cuda.is_available() else "cpu" 12 | _model = None 13 | _tokenizer = None 14 | 15 | 16 | def load_model(): 17 | """ 18 | Load the GOT-OCR model and tokenizer if they haven't been loaded already. 19 | Returns: 20 | model: The GOT-OCR model loaded on the appropriate device. 21 | tokenizer: The corresponding tokenizer. 
22 | """ 23 | global _model, _tokenizer 24 | if _model is None or _tokenizer is None: 25 | _tokenizer = AutoTokenizer.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True) 26 | _model = AutoModel.from_pretrained( 27 | "ucaslcl/GOT-OCR2_0", 28 | trust_remote_code=True, 29 | use_safetensors=True, 30 | revision="979938bf89ccdc949c0131ddd3841e24578a4742", 31 | pad_token_id=_tokenizer.eos_token_id, 32 | ) 33 | _model = _model.eval().to(_device) 34 | return _model, _tokenizer 35 | 36 | 37 | def run_gotocr(pdf_path: str, page_num: int = 1, ocr_type: str = "ocr") -> str: 38 | """ 39 | Convert page of a PDF file to markdown using GOT-OCR. 40 | 41 | This function renders the first page of the PDF to an image, runs OCR on that image, 42 | and returns the OCR result as a markdown-formatted string. 43 | 44 | Args: 45 | pdf_path (str): The local path to the PDF file. 46 | 47 | Returns: 48 | str: The OCR result in markdown format. 49 | """ 50 | # Ensure the model is loaded (cached across calls) 51 | model, tokenizer = load_model() 52 | 53 | # Convert the first page of the PDF to a base64-encoded PNG image. 54 | base64image = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=1024) 55 | 56 | # Write the image to a temporary file. 57 | with tempfile.NamedTemporaryFile("wb", suffix=".png", delete=False) as tmp: 58 | tmp.write(base64.b64decode(base64image)) 59 | tmp_filename = tmp.name 60 | 61 | # Run GOT-OCR on the saved image. 62 | result = model.chat(tokenizer, tmp_filename, ocr_type=ocr_type) 63 | 64 | # Clean up the temporary file. 65 | os.remove(tmp_filename) 66 | 67 | return result 68 | -------------------------------------------------------------------------------- /olmocr/bench/runners/run_marker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | from marker.converters.pdf import PdfConverter 5 | from marker.models import create_model_dict 6 | from marker.output import text_from_rendered 7 | from pypdf import PdfReader, PdfWriter 8 | 9 | _marker_converter = None 10 | 11 | 12 | def run_marker(pdf_path: str, page_num: int = 1) -> str: 13 | global _marker_converter 14 | 15 | if _marker_converter is None: 16 | # Create a configuration dictionary with the necessary settings 17 | config = { 18 | "texify_inline_spans": True, # This enables conversion of inline math to LaTeX 19 | } 20 | 21 | _marker_converter = PdfConverter(artifact_dict=create_model_dict(), config=config) 22 | 23 | # Extract the specific page from the PDF 24 | pdf_to_process = pdf_path 25 | temp_file = None 26 | 27 | if page_num > 0: # If a specific page is requested 28 | reader = PdfReader(pdf_path) 29 | 30 | # Check if the requested page exists 31 | if page_num > len(reader.pages): 32 | raise ValueError(f"Page {page_num} does not exist in the PDF. 
PDF has {len(reader.pages)} pages.") 33 | 34 | # Create a new PDF with just the requested page 35 | writer = PdfWriter() 36 | # pypdf uses 0-based indexing, so subtract 1 from page_num 37 | writer.add_page(reader.pages[page_num - 1]) 38 | 39 | # Save the extracted page to a temporary file 40 | temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) 41 | temp_file.close()  # Close the file but keep the name 42 | 43 | with open(temp_file.name, "wb") as output_pdf: 44 | writer.write(output_pdf) 45 | 46 | pdf_to_process = temp_file.name 47 | 48 | try: 49 | # Process the PDF (either original or single-page extract) 50 | rendered = _marker_converter(pdf_to_process) 51 | text, _, images = text_from_rendered(rendered) 52 | return text 53 | finally: 54 | # Clean up the temporary file if it was created 55 | if temp_file and os.path.exists(temp_file.name): 56 | os.unlink(temp_file.name) 57 | -------------------------------------------------------------------------------- /olmocr/bench/runners/run_mineru.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | from magic_pdf.config.enums import SupportedPdfParseMethod 5 | from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter 6 | from magic_pdf.data.dataset import PymuDocDataset 7 | from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze 8 | from pypdf import PdfReader, PdfWriter 9 | 10 | 11 | def run_mineru(pdf_path: str, page_num: int = 1) -> str: 12 | output_folder = tempfile.TemporaryDirectory() 13 | image_output_folder = tempfile.TemporaryDirectory() 14 | 15 | # Initialize writers (same for all PDFs) 16 | image_writer = FileBasedDataWriter(image_output_folder.name) 17 | md_writer = FileBasedDataWriter(output_folder.name) 18 | temp_file = None  # Initialized up front so the finally block below is safe when no temp page file is created 19 | if page_num > 0:  # If a specific page is requested 20 | reader = PdfReader(pdf_path) 21 | 22 | # Check if the requested page exists 23 | if page_num > len(reader.pages): 24 | raise ValueError(f"Page {page_num} does not exist in the PDF. 
PDF has {len(reader.pages)} pages.") 25 | 26 | # Create a new PDF with just the requested page 27 | writer = PdfWriter() 28 | # pypdf uses 0-based indexing, so subtract 1 from page_num 29 | writer.add_page(reader.pages[page_num - 1]) 30 | 31 | # Save the extracted page to a temporary file 32 | temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) 33 | temp_file.close()  # Close the file but keep the name 34 | 35 | with open(temp_file.name, "wb") as output_pdf: 36 | writer.write(output_pdf) 37 | 38 | pdf_to_process = temp_file.name 39 | else: 40 | pdf_to_process = pdf_path 41 | 42 | try: 43 | # Read the PDF file bytes 44 | reader = FileBasedDataReader("") 45 | pdf_bytes = reader.read(pdf_to_process) 46 | 47 | # Create dataset instance 48 | ds = PymuDocDataset(pdf_bytes) 49 | 50 | # Inference: decide whether to run OCR mode based on dataset classification 51 | if ds.classify() == SupportedPdfParseMethod.OCR: 52 | infer_result = ds.apply(doc_analyze, ocr=True) 53 | pipe_result = infer_result.pipe_ocr_mode(image_writer) 54 | else: 55 | infer_result = ds.apply(doc_analyze, ocr=False) 56 | pipe_result = infer_result.pipe_txt_mode(image_writer) 57 | 58 | # Generate markdown content; the image directory is the basename of the images output folder 59 | image_dir_basename = os.path.basename(image_output_folder.name) 60 | # md_content = pipe_result.get_markdown(image_dir_basename) 61 | 62 | # Dump markdown file 63 | with tempfile.NamedTemporaryFile("w+", suffix=".md") as tf: 64 | pipe_result.dump_md(md_writer, tf.name, image_dir_basename) 65 | tf.flush() 66 | 67 | tf.seek(0) 68 | md_data = tf.read() 69 | 70 | return md_data 71 | finally: 72 | # Clean up the temporary file if it was created 73 | if temp_file and os.path.exists(temp_file.name): 74 | os.unlink(temp_file.name) 75 | -------------------------------------------------------------------------------- /olmocr/bench/runners/run_mistral.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | from mistralai import Mistral 5 | from pypdf import PdfReader, PdfWriter 6 | 7 | 8 | def run_mistral(pdf_path: str, page_num: int = 1) -> str: 9 | """ 10 | Convert a page of a PDF file to markdown using the Mistral OCR API 11 | https://docs.mistral.ai/capabilities/document/ 12 | 13 | Args: 14 | pdf_path (str): The local path to the PDF file. 15 | 16 | Returns: 17 | str: The OCR result in markdown format. 18 | """ 19 | if not os.getenv("MISTRAL_API_KEY"): 20 | raise SystemExit("You must specify a MISTRAL_API_KEY") 21 | 22 | api_key = os.environ["MISTRAL_API_KEY"] 23 | client = Mistral(api_key=api_key) 24 | temp_file = None  # Initialized up front so the finally block below is safe when no temp page file is created 25 | if page_num > 0:  # If a specific page is requested 26 | reader = PdfReader(pdf_path) 27 | 28 | # Check if the requested page exists 29 | if page_num > len(reader.pages): 30 | raise ValueError(f"Page {page_num} does not exist in the PDF. 
PDF has {len(reader.pages)} pages.") 31 | 32 | # Create a new PDF with just the requested page 33 | writer = PdfWriter() 34 | # pypdf uses 0-based indexing, so subtract 1 from page_num 35 | writer.add_page(reader.pages[page_num - 1]) 36 | 37 | # Save the extracted page to a temporary file 38 | temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) 39 | temp_file.close() # Close the file but keep the name 40 | 41 | with open(temp_file.name, "wb") as output_pdf: 42 | writer.write(output_pdf) 43 | 44 | pdf_to_process = temp_file.name 45 | else: 46 | pdf_to_process = pdf_path 47 | 48 | try: 49 | with open(pdf_to_process, "rb") as pf: 50 | uploaded_pdf = client.files.upload( 51 | file={ 52 | "file_name": os.path.basename(pdf_path), 53 | "content": pf, 54 | }, 55 | purpose="ocr", 56 | ) 57 | 58 | signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id) 59 | 60 | ocr_response = client.ocr.process( 61 | model="mistral-ocr-2503", 62 | document={ 63 | "type": "document_url", 64 | "document_url": signed_url.url, 65 | }, 66 | ) 67 | 68 | client.files.delete(file_id=uploaded_pdf.id) 69 | 70 | return ocr_response.pages[0].markdown 71 | finally: 72 | # Clean up the temporary file if it was created 73 | if temp_file and os.path.exists(temp_file.name): 74 | os.unlink(temp_file.name) 75 | -------------------------------------------------------------------------------- /olmocr/bench/runners/run_olmocr_pipeline.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from dataclasses import dataclass 4 | from typing import Optional 5 | 6 | # Import necessary components from olmocr 7 | from olmocr.pipeline import ( 8 | MetricsKeeper, 9 | PageResult, 10 | WorkerTracker, 11 | process_page, 12 | sglang_server_host, 13 | sglang_server_ready, 14 | ) 15 | 16 | # Setup basic logging 17 | logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") 18 | logger = logging.getLogger("olmocr_runner") 19 | 20 | 21 | # Basic configuration 22 | @dataclass 23 | class Args: 24 | model: str = "allenai/olmOCR-7B-0225-preview" 25 | model_chat_template: str = "qwen2-vl" 26 | model_max_context: int = 8192 27 | target_longest_image_dim: int = 1024 28 | target_anchor_text_len: int = 6000 29 | max_page_retries: int = 8 30 | max_page_error_rate: float = 0.004 31 | 32 | 33 | server_check_lock = asyncio.Lock() 34 | 35 | 36 | async def run_olmocr_pipeline(pdf_path: str, page_num: int = 1, model: str = "allenai/olmOCR-7B-0225-preview") -> Optional[str]: 37 | """ 38 | Process a single page of a PDF using the official olmocr pipeline's process_page function 39 | 40 | Args: 41 | pdf_path: Path to the PDF file 42 | page_num: Page number to process (1-indexed) 43 | 44 | Returns: 45 | The extracted text from the page or None if processing failed 46 | """ 47 | # Ensure global variables are initialized 48 | global metrics, tracker 49 | if "metrics" not in globals() or metrics is None: 50 | metrics = MetricsKeeper(window=60 * 5) 51 | if "tracker" not in globals() or tracker is None: 52 | tracker = WorkerTracker() 53 | 54 | args = Args() 55 | args.model = model 56 | semaphore = asyncio.Semaphore(1) 57 | worker_id = 0 # Using 0 as default worker ID 58 | 59 | # Ensure server is running 60 | async with server_check_lock: 61 | _server_task = None 62 | try: 63 | await asyncio.wait_for(sglang_server_ready(), timeout=5) 64 | logger.info("Using existing sglang server") 65 | except Exception: 66 | logger.info("Starting new 
sglang server") 67 | _server_task = asyncio.create_task(sglang_server_host(args.model, args, semaphore)) 68 | await sglang_server_ready() 69 | 70 | try: 71 | # Process the page using the pipeline's process_page function 72 | # Note: process_page expects both original path and local path 73 | # In our case, we're using the same path for both 74 | page_result: PageResult = await process_page(args=args, worker_id=worker_id, pdf_orig_path=pdf_path, pdf_local_path=pdf_path, page_num=page_num) 75 | 76 | # Return the natural text from the response 77 | if page_result and page_result.response and not page_result.is_fallback: 78 | return page_result.response.natural_text 79 | return None 80 | 81 | except Exception as e: 82 | logger.error(f"Error processing page: {type(e).__name__} - {str(e)}") 83 | return None 84 | 85 | finally: 86 | # We leave the server running for potential reuse 87 | pass 88 | 89 | 90 | async def main(): 91 | # Example usage 92 | pdf_path = "your_pdf_path.pdf" 93 | page_num = 1 94 | 95 | result = await run_olmocr_pipeline(pdf_path, page_num) 96 | if result: 97 | print(f"Extracted text: {result[:200]}...") # Print first 200 chars 98 | else: 99 | print("Failed to extract text from the page") 100 | 101 | 102 | if __name__ == "__main__": 103 | asyncio.run(main()) 104 | -------------------------------------------------------------------------------- /olmocr/bench/runners/run_rolmocr.py: -------------------------------------------------------------------------------- 1 | import httpx 2 | 3 | from olmocr.data.renderpdf import render_pdf_to_base64png 4 | 5 | 6 | async def run_rolmocr( 7 | pdf_path: str, 8 | page_num: int = 1, 9 | server: str = "localhost:30000", 10 | model: str = "reducto/RolmOCR", 11 | temperature: float = 0.2, 12 | target_longest_image_dim: int = 1024, 13 | ) -> str: 14 | """ 15 | 16 | 17 | Returns: 18 | str: The OCR result in markdown format. 19 | """ 20 | # Convert the first page of the PDF to a base64-encoded PNG image. 
21 | image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim) 22 | 23 | request = { 24 | "model": model, 25 | "messages": [ 26 | { 27 | "role": "user", 28 | "content": [ 29 | { 30 | "type": "image_url", 31 | "image_url": {"url": f"data:image/png;base64,{image_base64}"}, 32 | }, 33 | { 34 | "type": "text", 35 | "text": "Return the plain text representation of this document as if you were reading it naturally.\n", 36 | }, 37 | ], 38 | } 39 | ], 40 | "temperature": temperature, 41 | "max_tokens": 4096, 42 | } 43 | 44 | # Make request and get response using httpx 45 | url = f"http://{server}/v1/chat/completions" 46 | 47 | async with httpx.AsyncClient(timeout=300) as client: 48 | response = await client.post(url, json=request) 49 | 50 | response.raise_for_status() 51 | data = response.json() 52 | 53 | choice = data["choices"][0] 54 | assert ( 55 | choice["finish_reason"] == "stop" 56 | ), "Response from server did not finish with finish_reason stop as expected; this is probably going to lead to bad data" 57 | 58 | return choice["message"]["content"] 59 | -------------------------------------------------------------------------------- /olmocr/bench/runners/run_server.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Literal 3 | 4 | import httpx 5 | 6 | from olmocr.bench.prompts import ( 7 | build_basic_prompt, 8 | build_openai_silver_data_prompt_no_document_anchoring, 9 | ) 10 | from olmocr.data.renderpdf import render_pdf_to_base64png 11 | from olmocr.prompts.anchor import get_anchor_text 12 | from olmocr.prompts.prompts import ( 13 | PageResponse, 14 | build_finetuning_prompt, 15 | build_openai_silver_data_prompt, 16 | ) 17 | 18 | 19 | async def run_server( 20 | pdf_path: str, 21 | page_num: int = 1, 22 | server: str = "localhost:30000", 23 | model: str = "allenai/olmOCR-7B-0225-preview", 24 | temperature: float = 0.1, 25 | target_longest_image_dim: int = 1024, 26 | prompt_template: Literal["full", "full_no_document_anchoring", "basic", "finetune"] = "finetune", 27 | response_template: Literal["plain", "json"] = "json", 28 | ) -> str: 29 | """ 30 | Convert a page of a PDF file to markdown by making a single request 31 | against an OpenAI-compatible server. 32 | 33 | You can use this for running against vLLM or SGLang servers, 34 | as well as for mixing and matching different models. 35 | 36 | It will only make one direct request, with no retries or error checking. 37 | 38 | Returns: 39 | str: The OCR result in markdown format. 40 | """ 41 | # Convert the requested page of the PDF to a base64-encoded PNG image. 
42 | image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim) 43 | anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport") 44 | 45 | if prompt_template == "full": 46 | prompt = build_openai_silver_data_prompt(anchor_text) 47 | elif prompt_template == "full_no_document_anchoring": 48 | prompt = build_openai_silver_data_prompt_no_document_anchoring(anchor_text) 49 | elif prompt_template == "finetune": 50 | prompt = build_finetuning_prompt(anchor_text) 51 | elif prompt_template == "basic": 52 | prompt = build_basic_prompt() 53 | else: 54 | raise ValueError("Unknown prompt template") 55 | 56 | request = { 57 | "model": model, 58 | "messages": [ 59 | { 60 | "role": "user", 61 | "content": [ 62 | {"type": "text", "text": prompt}, 63 | {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}, 64 | ], 65 | } 66 | ], 67 | "temperature": temperature, 68 | "max_tokens": 3000, 69 | } 70 | 71 | # Make request and get response using httpx 72 | url = f"http://{server}/v1/chat/completions" 73 | 74 | async with httpx.AsyncClient(timeout=300) as client: 75 | response = await client.post(url, json=request) 76 | 77 | response.raise_for_status() 78 | data = response.json() 79 | 80 | choice = data["choices"][0] 81 | assert ( 82 | choice["finish_reason"] == "stop" 83 | ), "Response from server did not finish with finish_reason stop as expected; this is probably going to lead to bad data" 84 | 85 | if response_template == "json": 86 | page_data = json.loads(choice["message"]["content"]) 87 | page_response = PageResponse(**page_data) 88 | return page_response.natural_text 89 | elif response_template == "plain": 90 | return choice["message"]["content"] 91 | else: 92 | raise ValueError("Unknown response template") 93 | -------------------------------------------------------------------------------- /olmocr/bench/runners/run_transformers.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | from io import BytesIO 4 | from typing import Literal 5 | 6 | import torch 7 | from PIL import Image 8 | from transformers import AutoProcessor, Qwen2VLForConditionalGeneration 9 | 10 | from olmocr.data.renderpdf import render_pdf_to_base64png 11 | from olmocr.prompts.anchor import get_anchor_text 12 | from olmocr.prompts.prompts import ( 13 | PageResponse, 14 | build_finetuning_prompt, 15 | build_openai_silver_data_prompt, 16 | ) 17 | 18 | _cached_model = None 19 | _cached_processor = None 20 | 21 | 22 | def run_transformers( 23 | pdf_path: str, 24 | page_num: int = 1, 25 | model: str = "allenai/olmOCR-7B-0225-preview", 26 | temperature: float = 0.1, 27 | target_longest_image_dim: int = 1024, 28 | prompt_template: Literal["full", "finetune"] = "finetune", 29 | response_template: Literal["plain", "json"] = "json", 30 | ) -> str: 31 | """ 32 | Convert a page of a PDF file to markdown by running a Qwen2-VL 33 | checkpoint locally with HuggingFace transformers. 34 | 35 | The model and processor are loaded on first use and cached 36 | for reuse across subsequent calls. 37 | 38 | It will only make one generation pass, with no retries or error checking. 39 | 40 | Returns: 41 | str: The OCR result in markdown format. 
42 | """ 43 | # Initialize the model 44 | global _cached_model, _cached_processor 45 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 46 | 47 | if _cached_model is None: 48 | model = Qwen2VLForConditionalGeneration.from_pretrained(model, torch_dtype=torch.bfloat16).eval() 49 | processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct") 50 | model = model.to(device) 51 | 52 | _cached_model = model 53 | _cached_processor = processor 54 | else: 55 | model = _cached_model 56 | processor = _cached_processor 57 | 58 | # Convert the first page of the PDF to a base64-encoded PNG image. 59 | image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim) 60 | anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport") 61 | 62 | if prompt_template == "full": 63 | prompt = build_openai_silver_data_prompt(anchor_text) 64 | else: 65 | prompt = build_finetuning_prompt(anchor_text) 66 | 67 | messages = [ 68 | { 69 | "role": "user", 70 | "content": [ 71 | {"type": "text", "text": prompt}, 72 | {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}, 73 | ], 74 | } 75 | ] 76 | 77 | # Apply the chat template and processor 78 | text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 79 | main_image = Image.open(BytesIO(base64.b64decode(image_base64))) 80 | 81 | inputs = processor( 82 | text=[text], 83 | images=[main_image], 84 | padding=True, 85 | return_tensors="pt", 86 | ) 87 | inputs = {key: value.to(device) for (key, value) in inputs.items()} 88 | 89 | # Generate the output 90 | MAX_NEW_TOKENS = 3000 91 | with torch.no_grad(): 92 | output = model.generate( 93 | **inputs, 94 | temperature=temperature, 95 | max_new_tokens=MAX_NEW_TOKENS, 96 | num_return_sequences=1, 97 | do_sample=True, 98 | ) 99 | 100 | # Decode the output 101 | prompt_length = inputs["input_ids"].shape[1] 102 | new_tokens = output[:, prompt_length:] 103 | text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0] 104 | 105 | assert new_tokens.shape[1] < MAX_NEW_TOKENS, "Output exceed max new tokens" 106 | 107 | if response_template == "json": 108 | page_data = json.loads(text_output) 109 | page_response = PageResponse(**page_data) 110 | return page_response.natural_text 111 | elif response_template == "plain": 112 | return text_output 113 | -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/buildingnotes_pg1_repeat1.md: -------------------------------------------------------------------------------- 1 | Master - 7 1/4 - 36" 2 | Master Bath - 7 1/4 - 30" 3 | Laundry - 4 3/4 - 36" 4 | Bath - 7 1/4 - 24" 5 | MUD - 7 - 36" 6 | UTIL - 8 1/4 - 36" 7 | DOWN BATH - 7 1/4 - 32" 8 | BUT KIT - 6 3/4 - 30 9 | PANTRY - 4 3/4 - 24 10 | 6 WEST - 32 9/8 - 32 11 | 6 WEST BATH 5" - 24" -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/earnings_pg1_repeat1.md: -------------------------------------------------------------------------------- 1 | Recently Issued Accounting Pronouncements 2 | 3 | Recently Adopted Accounting Pronouncement 4 | 5 | In November 2023, the Financial Accounting Standards Board, or FASB, issued a new accounting standard requiring disclosures of significant expenses in operating segments. We adopted this standard in our fiscal year 2025 annual report. 
Refer to Note 16 of the Notes to the Consolidated Financial Statements in Part IV, Item 15 of this Annual Report on Form 10-K for further information. 6 | 7 | Recent Accounting Pronouncements Not Yet Adopted 8 | 9 | In December 2023, the FASB issued a new accounting standard which includes new and updated income tax disclosures, including disaggregation of information in the rate reconciliation and income taxes paid. We expect to adopt this standard in our fiscal year 2026 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures. 10 | 11 | In November 2024, the FASB issued a new accounting standard requiring disclosures of certain additional expense information on an annual and interim basis, including, among other items, the amounts of purchases of inventory, employee compensation, depreciation and intangible asset amortization included within each income statement expense caption, as applicable. We expect to adopt this standard in our fiscal year 2028 annual report. We do not expect the adoption of this standard to have a material impact on our Consolidated Financial Statements other than additional disclosures. 12 | 13 | Note 2 - Business Combination 14 | 15 | Termination of the Arm Share Purchase Agreement 16 | 17 | In February 2022, NVIDIA and SoftBank Group Corp, or SoftBank, announced the termination of the Share Purchase Agreement whereby NVIDIA would have acquired Arm from SoftBank. The parties agreed to terminate it due to significant regulatory challenges preventing the completion of the transaction. We recorded an acquisition termination cost of $1.4 billion in fiscal year 2023 reflecting the write-off of the prepayment provided at signing. 18 | 19 | Note 3 - Stock-Based Compensation 20 | 21 | Stock-based compensation expense is associated with RSUs, PSUs, market-based PSUs, and our ESPP. 22 | 23 | Consolidated Statements of Income include stock-based compensation expense, net of amounts capitalized into inventory and subsequently recognized to cost of revenue, as follows: 24 | 25 | | Year Ended | Jan 29, 2023 | Jan 28, 2024 | Jan 29, 2023 | 26 | |------------|--------------|--------------|--------------| 27 | | (In millions) | $138 | $141 | $138 | 28 | | Cost of revenue | $178 | $141 | $138 | 29 | | Research and development | 3,423 | 2,532 | 1,892 | 30 | | Sales, general and administrative | 1,136 | 876 | 680 | 31 | | Total | $4,737 | $3,549 | $2,710 | 32 | 33 | Stock-based compensation capitalized in inventories was not significant during fiscal years 2025, 2024, and 2023. 
-------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ff1fc6a205ad039139ce566851b6b260c929_pg1_pg1_repeat1.md: -------------------------------------------------------------------------------- 1 | RTG Degradation Primer and Application to MMRTG 2 | 3 | Nuclear and Emerging Technology for Space (NETS) 2015 4 | February 23-26, 2015 5 | Abstract 5107 6 | 7 | Presenting Author: Tom Hammel, Teledyne Energy Systems 8 | Co-Authors: Russell Bennett, Teledyne Energy Systems 9 | Robert Sievers, Teledyne Energy Systems 10 | Bill Otting, Aerojet Rocketdyne -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ff3d6e051903fe5ca9bc172ece14964c5632_pg1_pg1_repeat1.md: -------------------------------------------------------------------------------- 1 | بررسی دیدگاه و نظرات کتابداران و اعضای هیئت علمی دانشگاه شیراز در بره گیری از فناوری شبکه‌های پی سیم در کتابخانه‌های دانشگاهی 2 | 3 | چکیده: نظر به اهمیت و کاربرد گسترده شبکه‌های پی سیم در محیط‌های دانشگاهی در کشورهای پیشرفته و استفاده از آن در خدمات کتابخانه‌ای، بهره‌گیری از این فناوری در کتابخانه‌های دانشگاهی کشور احساس می‌شود. این پژوهش با استفاده از روش پیام‌برداری، با هدف معرفی شبکه‌های پی سیم، به بررسی دیدگاه و نظرات کتابداران و اعضای هیئت علمی دانشگاه شیراز، در یک کارگیری از این شبکه‌ها در کتابخانه‌های دانشگاهی پرداخت. باقی‌مانده‌های تحقیق نشان داد که کتابداران تا مایل زیادی به استفاده از شبکه‌های پی سیم در امر خدمات کتابخانه‌های تظیم می‌باشند و قفسه‌خوانی و دسترسی به فهرست عمومی پوسته دارند. و گسترش گی پوشش و سیاست‌های پیشرفته و سیاست‌های پیشرفته در کتابخانه‌ها را خواستار شده‌اند. در حالی که اعضای هیئت علمی، ضرورت استفاده بیشتر از این شبکه‌ها در کل محیط دانشگاه و دسترسی به منابع کتابخانه‌ای از خارج از محیط کتابخانه را خواستار شده‌اند. در کل، با توجه به نتایج حاصل از این تحقیق می‌توان گفت که هر چند استفاده از شبکه‌های پی سیم در کتابخانه‌ها پیش‌بینی‌های زیادی و نوآوری‌های می‌باشد و هنوز در کشور ما چندان مورد توجه قرار نگرفته و ناشناخته مانده است، اما تماشای کتابداران، پژوهشگران و استادان به استفاده و کاربرد آن در محیط‌های دانشگاهی زیاد است. 4 | 5 | کلیدواژه‌ها: شبکه‌های پی سیم؛ کتابخانه‌های دانشگاهی؛ اعضای هیئت علمی؛ کتابداران؛ رایانه‌های قابل حمل؛ رایانه‌های دستی؛ منابع و خدمات کتابخانه‌ای؛ دانشگاه شیراز 6 | 7 | farbod4ever@gmail.com 8 | 9 | نویسنده رابطه: -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ff4f7dad78081cff727d19ab51c181d4a661_pg1_pg1_repeat1.md: -------------------------------------------------------------------------------- 1 | Molecular markers of breast cancer metastasis 2 | Weigelt, B. 3 | 4 | Citation for published version (APA): 5 | Weigelt, B. (2005). Molecular markers of breast cancer metastasis 6 | 7 | General rights 8 | It is not permitted to download or to forward/distribute the text or part of it without the consent of the author(s) and/or copyright holder(s), other than for strictly personal, individual use, unless the work is under an open content license (like Creative Commons). 9 | 10 | Disclaimer/Complaints regulations 11 | If you believe that digital publication of certain material infringes any of your rights or (privacy) interests, please let the Library know, stating your reasons. In case of a legitimate complaint, the Library will make the material inaccessible and/or remove it from the website. 
Please Ask the Library: http://uba.uva.nl/en/contact, or a letter to: Library of the University of Amsterdam, Secretariat, Singel 425, 1012 WP Amsterdam, The Netherlands. You will be contacted as soon as possible. -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ff518b1240a66978f22035528ccb029450b5_pg2_pg1_repeat1.md: -------------------------------------------------------------------------------- 1 | Brief Notices 2 | 3 | Prophet of the Jubilee, translated and edited by Ronald D. Dennis (Religious Studies Center, Brigham Young University, 1997) 4 | 5 | In July 1846 in Rhydybont, Carmarthenshire, Wales, Dan Jones published the first issue of a monthly LDS periodical in the Welsh language on a press owned by John Jones, Dan’s brother, who was an ordained Congregational minister. The periodical, Prophwyd y Jubili (Prophet of the Jubilee), ran monthly thereafter through December 1848. Jones’s great-great-grandson Ronald Dennis has presented what he calls a “facsimile translation” (xxix) of the complete series, retaining original fonts, layout, and pagination, slightly enlarging font size for readability. Text and index are over seven hundred pages, and Geraint Bowen, former Archdruid of Wales, offers a superb introduction. 6 | 7 | Many articles in Prophet of the Jubilee rebut arguments of local anti-Mormons or apostates. Articles entitled “The ‘Hater of Deceit’ Proving Himself a False Prophet Again!!” and “The ‘Rev. W. R. Davies, from Dowlais,’ and His Cruel and Shameful Persecution Again!—Again!!” give a glimpse of the intense feelings between early Welsh Saints and their religious adversaries. Jones garnishes his numerous doctrinal treatises with occasional fiction and poetry, excerpts translated from the Millennial Star, the neighboring LDS periodical in England, and portions of articles on religious topics taken from European and U. S. newspapers. 8 | 9 | A brief summary of each article is provided at the beginning of the book, but after that the reader is left to plod through the text without annotations. While pagination is sure to confuse some readers, Prophet of the Jubilee opens up LDS historical documents that have been inaccessible to most English-speaking readers for 150 years. Here is a mass of interesting cultural and doctrinal history, as well as the voice of Dan Jones himself, one of the most prolific and persistent missionaries in the history of the Church. 10 | 11 | —Jed L. Woodworth 12 | 13 | Book of Mormon Authors: Their Words and Messages, by Roger R. Keller (Religious Studies Center, Brigham Young University, 1996) 14 | 15 | The statistical study of Book of Mormon texts is a well-traveled road in Book of Mormon scholarship. 
However, in Book of Mormon Authors, Roger Keller shows -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ffaac214730d2b8c2ec842e3618ccb9c4259_pg1_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ffaac214730d2b8c2ec842e3618ccb9c4259_pg1_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/headers_footers/fff590bed29a2854ac1f874dad5752ede1aa_pg1_pg1_repeat1.md: -------------------------------------------------------------------------------- 1 | User’s Manual 2 | 3 | Model 475 4 | DSP Gaussmeter 5 | 6 | Lake Shore Cryotronics, Inc. 7 | 575 McCorkle Blvd. 8 | Westerville, Ohio 43082-8888 USA 9 | 10 | E-mail Addresses: 11 | sales@lakeshore.com 12 | service@lakeshore.com 13 | 14 | Visit Our Website At: 15 | www.lakeshore.com 16 | 17 | Fax: (614) 891-1392 18 | Telephone: (614) 891-2243 19 | 20 | Methods and apparatus disclosed and described herein have been developed solely on company funds of Lake Shore Cryotronics, Inc. No government or other contractual support or relationship whatsoever has existed which in any way affects or mitigates proprietary rights of Lake Shore Cryotronics, Inc. in these developments. Methods and apparatus disclosed herein may be subject to U.S. Patents existing or applied for. Lake Shore Cryotronics, Inc. reserves the right to add, improve, modify, or withdraw functions, design modifications, or products at any time without notice. Lake Shore shall not be liable for errors contained herein or for incidental or consequential damages in connection with furnishing, performance, or use of this material. -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/lincoln_letter_pg1_repeat1.md: -------------------------------------------------------------------------------- 1 | Executive Mansion, 2 | 3 | Washington City, 4 | 5 | January 15th, 1864 6 | 7 | Major General Hitchcock, Commissioner of Exchanges, is authorized and directed to offer Brigadier General Trimble, now a prisoner of war in Fort McHenry, in exchange for Major White, who is held as a prisoner at Richmond. He is also directed to send forward the offer of exchange by Henry M. Warfield, Esq. of Baltimore, under a flag of truce, and give him a pass to City Point. 8 | 9 | Abraham Lincoln -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/math_2503_04086_pg1_repeat1.md: -------------------------------------------------------------------------------- 1 | Proof. Let $S$ be the generating set associated with $D$ as described in Proposition 2.5. By the circulant diagonalization theorem, the spectrum of $G_R(D) = \Gamma(R, S)$ is the multiset $\{\lambda_g\}_{g \in R}$ where 2 | 3 | $$\lambda_g = \sum_{s \in S} \zeta_n^{\psi(gs)} = \sum_{i=1}^k \left[ \sum_{s, Rs = I_i} \zeta_n^{\psi(gs)} \right].$$ 4 | 5 | We remark that by Corollary 2.7, if $s \in R$ such that $Rs = I_i = Rx_i$ then $s$ has a unique representation of the form $s = ux_i$ where $u \in (R/\text{Ann}_R(x_i))^\times$ and $\hat{u}$ is a fixed lift of $u$ to $R^\times$. 
With this presentation, we can write 6 | 7 | $$\sum_{s, Rs = I_i} \zeta_n^{\psi(gs)} = \sum_{u \in (R/\text{Ann}_R(x_i))^\times} \zeta_n^{\psi(gux_i)} = \sum_{u \in (R/\text{Ann}_R(x_i))^\times} \zeta_n^{\psi_xi(gu)} = c(g, R/\text{Ann}_R(x_i)).$$ 8 | 9 | Here we recall that $\psi_xi$ is the induced linear functional on $R/\text{Ann}_R(x_i)$. We conclude that $\lambda_g = \sum_{i=1}^k c(g, R/\text{Ann}_R(x_i)).$ \hfill $\square$ 10 | 11 | The following corollary is simple yet important for our future work on perfect state transfers on gcd-graphs. 12 | 13 | **Corollary 4.17.** Suppose that $g' = ug$ for some $u \in R^\times$. Then $\lambda_g = \lambda_{g'}$. 14 | 15 | **Acknowledgements** 16 | 17 | We thank the Department of Mathematics and Computer Science at Lake Forest College for their generous financial support through an Overleaf subscription. We also thank Ján Mináč for his constant encouragement and support. 18 | 19 | **References** 20 | 21 | 1. Reza Akhtar, Megan Boggess, Tiffany Jackson-Henderson, Isidora Jiménez, Rachel Karpman, Amanda Kinzel, and Dan Pritikin, *On the unitary Cayley graph of a finite ring*, Electron. J. Combin. 16 (2009), no. 1, Research Paper 117, 13 pages. 22 | 2. Milan Bašić, Aleksandar Ilić, and Aleksandar Stamenković, *Maximal diameter of integral circulant graphs*, Information and Computation 301 (2024), 105208. 23 | 3. Maria Chudnovsky, Michal Cizek, Logan Crew, Ján Mináč, Tung T. Nguyen, Sophie Spirkl, and Nguyễn Duy Tấn, *On prime Cayley graphs*, arXiv:2401.06062, to appear in Journal of Combinatorics (2024). 24 | 4. Thomas Honold, *Characterization of finite frobenius rings*, Archiv der Mathematik 76 (2001), no. 6, 406–415. 25 | 5. Irving Kaplansky, *Elementary divisors and modules*, Transactions of the American Mathematical Society 66 (1949), no. 2, 464–491. 26 | 6. Walter Klotz and Torsten Sander, *Some properties of unitary Cayley graphs*, The Electronic Journal of Combinatorics 14 (2007), no. 1, R45, 12 pages. 27 | 7. Erich Lamprecht, *Allgemeine theorie der Gaußschen Summen in endlichen kommutativen Ringen*, Mathematische Nachrichten 9 (1953), no. 3, 149–196. 28 | 8. Ján Mináč, Tung T Nguyen, and Nguyen Duy Tấn, *Isomorphic gcd-graphs over polynomial rings*, arXiv preprint arXiv:2411.01768 (2024). 29 | 9. ______, *On the gcd graphs over polynomial rings*, arXiv preprint arXiv:2409.01929 (2024). -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/mathfuncs_colswitch_pg1_repeat1.md: -------------------------------------------------------------------------------- 1 | # The 20 Most Important Mathematical Equations 2 | 3 | A journey through the most elegant and influential formulas in mathematics 4 | 5 | | 1. Euler’s Identity | 3. The Fundamental Theorem of Calculus | 6 | |---------------------|----------------------------------------| 7 | | \( e^{i\pi} + 1 = 0 \) | \( \int_a^b f(x) \, dx = F(b) - F(a) \) | 8 | | Connects five fundamental constants (e, i, π, 1, 0), revealing the profound relationship between exponential functions and trigonometry. | Establishes that differentiation and integration are inverse operations. If F is an antiderivative of f, the definite integral equals F(b) - F(a). Revolutionized mathematical problem-solving. | 9 | 10 | | 2. Pythagorean Theorem | 4. 
Maxwell’s Equations | 11 | |------------------------|-----------------------| 12 | | \( a^2 + b^2 = c^2 \) | \( \nabla \cdot \mathbf{E} = \frac{\rho}{\varepsilon_0} \) | 13 | | In right triangles, the hypotenuse squared equals the sum of the squares of the other sides. Cornerstone of geometry with applications in navigation and architecture. | \( \nabla \cdot \mathbf{B} = 0 \) | 14 | | | \( \nabla \times \mathbf{E} = -\frac{\partial \mathbf{B}}{\partial t} \) | 15 | | | \( \nabla \times \mathbf{B} = \mu_0 \mathbf{J} + \mu_0 \varepsilon_0 \frac{\partial \mathbf{E}}{\partial t} \) | 16 | | | Unified electricity and magnetism as manifestations of the same force. Describes electromagnetic field behavior, predicting waves traveling at light speed. Enabled technologies from radio to smartphones. | -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/mathfuncs_pg1_repeat1.md: -------------------------------------------------------------------------------- 1 | # The 20 Most Important Mathematical Equations 2 | 3 | A journey through the most elegant and influential formulas in mathematics 4 | 5 | | 1. Euler's Identity | 2. Pythagorean Theorem | 6 | |--------------------|------------------------| 7 | | \( e^{i\pi} + 1 = 0 \) | \( a^2 + b^2 = c^2 \) | 8 | 9 | Connects five fundamental constants (e, i, π, 1, 0), revealing the profound relationship between exponential functions and trigonometry. 10 | 11 | In right triangles, the hypotenuse squared equals the sum of the squares of the other sides. Cornerstone of geometry with applications in navigation and architecture. 12 | 13 | | 3. The Fundamental Theorem of Calculus | 4. Maxwell's Equations | 14 | |----------------------------------------|------------------------| 15 | | \( \int_{a}^{b} f(x) \, dx = F(b) - F(a) \) | \( \nabla \cdot \mathbf{E} = \frac{Q}{\varepsilon_0} \) | 16 | 17 | Establishes that differentiation and integration are inverse operations. If \( F \) is an antiderivative of \( f \), the definite integral equals \( F(b) - F(a) \). Revolutionized mathematical problem-solving. 18 | 19 | Unified electricity and magnetism as manifestations of the same force. Describes electromagnetic field behavior, predicting waves traveling at light speed. Enabled technologies from radio to smartphones. 
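A quick worked instance of the Fundamental Theorem entry transcribed above (added here for illustration; this check is not part of the source PDF): taking \( f(x) = 2x \) with antiderivative \( F(x) = x^2 \),

$$\int_0^1 2x \, dx = F(1) - F(0) = 1^2 - 0^2 = 1.$$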
-------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/mattsnotes_pg1_repeat1.md: -------------------------------------------------------------------------------- 1 | V-February Flow 2 | 3 | Data Components: 4 | 5 | Code: 6 | The-Stack-V2 7 | 8 | CodeText: 9 | SE, whatever we've scraped 10 | 11 | WebText: 12 | HQ DCLM 13 | 14 | DATA MIXES 15 | 16 | ~85% Source Code 17 | ~10% CodeText 18 | ~5% Webtext 19 | 20 | ~85% The-stack-V2 21 | ~15% CodeText 22 | ~0% Webtext 23 | 24 | ~100% Source Code 25 | 26 | Deepseek Coder 27 | 28 | StarCoder 2 29 | 30 | Arctic -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/mattsnotes_pg2_repeat1.md: -------------------------------------------------------------------------------- 1 | P1: 100% Source code 2 | P2: 80% code 3 | 20% language 4 | 5 | Code Data Recipe [StackCoder] 6 | 1) Order by Repo ✓ 7 | 2) Call Heuristic Filters ✗ 8 | 3) Group by Repo, lang → minhash ✓ 9 | 4) Pack into Repo-level docs □ 10 | 5) Select PL's □ 11 | 12 | 6) Pack into FIM tokens ✗ 13 | 14 | ✓: Eng Done 15 | X: Eng definitely NOT done 16 | D: so so easy 17 | 18 | Use Preprocessed code/text, web/text -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/mattsnotes_pg3_repeat1.md: -------------------------------------------------------------------------------- 1 | ARCH + TRAINING 2 | 3 | - Pick Arch like OLMO-IB 4 | - OR replicate a 3D model 5 | - Follow standard LR flow 6 | 7 | Eval: 8 | 9 | Hacky nonsense for now -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/olmo2-pg4_pg1_repeat1.md: -------------------------------------------------------------------------------- 1 | Table 1 Composition of the pretraining data for OLMo 2. The OLMo 2 1124 Mix is composed of StarCoder (Li et al., 2023b; Kocetkov et al., 2022), peS2o (Soldaini and Lo, 2023), web text from DCLM (Li et al., 2024) and Wiki come from Dolma 1.7 (Soldaini et al., 2024). arXiv comes from Red-Pajama (Together AI, 2023), while OpenWebMath (Paster et al., 2023) and Algebraic Stack come from ProofPile II (Azerbayev et al., 2023). 2 | 3 | | Source | Type | Tokens | Words | Bytes | Docs | 4 | |-------------------------------|-----------------------|---------|--------|--------|-------| 5 | | DCLM-Baseline | Web pages | 3.71T | 3.32T | 21.32T | 2.95B | 6 | | StarCoder | Code | 83.0B | 70.0B | 459B | 78.7M | 7 | | filtered version from OLMoE Mix | Academic papers | 58.6B | 51.1B | 413B | 38.8M | 8 | | peS2o from Dolma 1.7 | Math web pages | 12.2B | 11.1B | 47.2B | 2.89M | 9 | | arXiv | Math proofs code | 11.8B | 10.8B | 44.0B | 2.83M | 10 | | OpenWebMath | Encyclopedic | 3.7B | 3.16B | 16.2B | 6.17M | 11 | | Wikipedia & Wikibooks from Dolma 1.7 | | | | | | 12 | | Total | | 3.90T | 3.48T | 22.38T | 3.08B | 13 | 14 | 2.1.1 Pretraining data: OLMo 2 Mix 1124 15 | 16 | The mix used for this stage is shown in Table 1. It consists of approximately 3.9 trillion tokens, with over 95% derived from web data. We refer to this set as OLMo 2 Mix 1124. This is the same pretraining data used in OLMoE (Muennighoff et al., 2024). 17 | 18 | We combine data from DCLM (Li et al., 2024) and Dolma 1.7 (Soldaini et al., 2024). From DCLM, we use the “baseline 1.0” mix. 
From Dolma, we use the arXiv (Together AI, 2023), OpenWebMath (Paster et al., 2023), Algebraic Stack, peS2o (Soldaini and Lo, 2023), and Wikipedia subsets. arXiv, OpenWebMath, and Algebraic Stack were originally part of ProofPile II (Azerbayev et al., 2023). 19 | 20 | Finally, we include code from StarCoder (Li et al., 2023b), which is derived from permissively-licensed repositories from GitHub (Kocetkov et al., 2022). In an attempt to include higher quality code, we remove any document from a repository with fewer than 2 stars on GitHub. Further, through manual inspection of this source, we found it to contain documents encoded in binary format or containing mostly numerical content; to remove them, we discarded documents whose most frequent word constitutes over 30% of the document, or whose top-2 most frequent words constitute over 50% of the document. To mitigate possible training loss spikes, we remove documents with repeated sequences of 32 or more n-grams. We report details and show effectiveness of this intervention in Section §3.1. 21 | 22 | 2.1.2 Mid-training data: Dolmino Mix 1124 23 | 24 | After the initial pretraining stage on mostly web data, we further train with a mixture of web data that has been more restrictively filtered for quality and a collection of domain-specific high quality data, much of which is synthetic. The purpose of this mixture is to imbue the model with math-centric skills and provide focused exposure to STEM references and high quality text. We generate several variants of this mixture, with varying sizes, but generally refer to this mixture as Dolmino Mix 1124. The base sources from which Dolmino Mix 1124 is subsampled are described in Table 2. We refer the reader to Section §4 for a deep dive detailing our processes for experimenting and curating data for this mix. -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/openstax_caculus_pg_273_pg1_repeat1.md: -------------------------------------------------------------------------------- 1 | 3.4 EXERCISES 2 | 3 | For the following exercises, the given functions represent the position of a particle traveling along a horizontal line. 4 | 5 | a. Find the velocity and acceleration functions. 6 | 7 | b. Determine the time intervals when the object is slowing down or speeding up. 8 | 9 | 150. \( s(t) = 2t^3 - 3t^2 - 12t + 8 \) 10 | 11 | 151. \( s(t) = 2t^3 - 15t^2 + 36t - 10 \) 12 | 13 | 152. \( s(t) = \frac{t}{1 + t^2} \) 14 | 15 | 153. A rocket is fired vertically upward from the ground. The distance \( s \) in feet that the rocket travels from the ground after \( t \) seconds is given by \( s(t) = -16t^2 + 560t \). 16 | 17 | a. Find the velocity of the rocket 3 seconds after being fired. 18 | 19 | b. Find the acceleration of the rocket 3 seconds after being fired. 20 | 21 | 154. A ball is thrown downward with a speed of 8 ft/s from the top of a 64-foot-tall building. After \( t \) seconds, its height above the ground is given by \( s(t) = -16t^2 - 8t + 64 \). 22 | 23 | a. Determine how long it takes for the ball to hit the ground. 24 | 25 | b. Determine the velocity of the ball when it hits the ground. 26 | 27 | 155. The position function \( s(t) = t^2 - 3t - 4 \) represents the position of the back of a car backing out of a driveway and then driving in a straight line, where \( s \) is in feet and \( t \) is in seconds. 
In this case, \( s(t) = 0 \) represents the time at which the back of the car is at the garage door, so \( s(0) = -4 \) is the starting position of the car, 4 feet inside the garage. 28 | 29 | a. Determine the velocity of the car when \( s(t) = 0 \). 30 | 31 | b. Determine the velocity of the car when \( s(t) = 14 \). 32 | 33 | 156. The position of a hummingbird flying along a straight line in \( t \) seconds is given by \( s(t) = 3t^3 - 7t \) meters. 34 | 35 | a. Determine the velocity of the bird at \( t = 1 \) sec. 36 | 37 | b. Determine the acceleration of the bird at \( t = 1 \) sec. 38 | 39 | c. Determine the acceleration of the bird when the velocity equals 0. 40 | 41 | 157. A potato is launched vertically upward with an initial velocity of 100 ft/s from a potato gun at the top of an 85-foot-tall building. The distance in feet that the potato travels from the ground after \( t \) seconds is given by \( s(t) = -16t^2 + 100t + 85 \). 42 | 43 | a. Find the velocity of the potato after 0.5 s and 5.75 s. 44 | 45 | b. Find the speed of the potato at 0.5 s and 5.75 s. 46 | 47 | c. Determine when the potato reaches its maximum height. 48 | 49 | d. Find the acceleration of the potato at 0.5 s and 1.5 s. 50 | 51 | e. Determine how long the potato is in the air. 52 | 53 | f. Determine the velocity of the potato upon hitting the ground. 54 | 55 | 158. The position function \( s(t) = t^3 - 8t \) gives the position in miles of a freight train where east is the positive direction and \( t \) is measured in hours. 56 | 57 | a. Determine the direction the train is traveling when \( s(t) = 0 \). 58 | 59 | b. Determine the direction the train is traveling when \( s(t) = 0 \). 60 | 61 | c. Determine the time intervals when the train is slowing down or speeding up. 62 | 63 | 159. The following graph shows the position \( y = s(t) \) of an object moving along a straight line. 64 | 65 | ![Graph of position function](image) 66 | 67 | a. Use the graph of the position function to determine the time intervals when the velocity is positive, negative, or zero. 68 | 69 | b. Sketch the graph of the velocity function. 70 | 71 | c. Use the graph of the velocity function to determine the time intervals when the acceleration is positive, negative, or zero. 72 | 73 | d. Determine the time intervals when the object is speeding up or slowing down. -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/small_page_size_pg1_repeat1.md: -------------------------------------------------------------------------------- 1 | any—was very trifling. Since the use of bones has, however, become general, the turnip crop has been, in many instances, ten-fold, and in few less than four or five-fold its former bulk. All the succeeding crops of grain and seeds have been amazingly increased, and, upon the four or five-shift system, there is no doubt the land will go on progressively improving, requiring a less quantity of bones annually, from its increased fertility and power." 2 | 3 | On light loams, the returns to the Doncaster Committee give bones a preference to farm-yard dung. And we learn that, upon the calcareous soil of the Yorkshire Wolds, heavy crops of turnips have been raised from 16 bushels per acre of bones, while in the same field, and under similar circumstances, but manured from the farm-yard at the rate of from 8 to 10 tons per acre, the turnips have been of the most inferior description. 
4 | 5 | On peat soils, if previously drained and laid dry, their advantages are reported to be so striking, that from fifteen to twenty bushels of dust per acre, drilled, have been also found to very far surpass the ordinary dressing of stable-dung, and even of lime and pigeons'-dung. 6 | 7 | On gravels, the reports are meagre and contradictory, though perhaps reconcilable in principle, as it has been justly observed, that "a gravelly soil may embrace every variety of texture and quality, from the light dry sand to the water-logged yellow clay—preserving in each the necessary admixture of stones and grit." To wet gravel, their application has been found decidedly unfavourable. 8 | 9 | **ANALYSIS.** 10 | 11 | An examination of the component parts of soils, and of the power of bones, when applied to them as manure, would go far to explain the irregularity of their different effects upon various kinds of soil. Bone is known to consist of about equal parts of earthy and animal matter; the former chiefly composed of gypsum—which is of so indestructible a nature as to have been termed, by early chemists, the "earth of bones"—and a small portion of carbonate of lime; from which we may conclude that probably half the weight of bones is in the greater part consumed by plants as direct nourishment in their state of growth, and that the remainder is more gradually absorbed by the soil, as well also as by the plants; for lime, though in small amount, is always present, in greater or less quantity, in all vegetable substances. 12 | 13 | "The quantity of earthy matter varies according to the age of the animal; and, in like manner, the quantity of animal matter varies also in proportion to the condition of the animal. In the best kinds of bones for manure, viz., those from fat young animals, perhaps the following proportions may give an approximation to the relative quantities of each in 100 parts: 14 | 15 | | Earthy and saline matter | 40 | 16 | | Cartilage and jelly | 40 | 17 | | Fatty matter | 20 | 18 | 19 | The soft parts thus form, in the best bone, about sixty, and upon an average, perhaps, amount to fifty per cent., which are almost entirely constituted of the same elements of plants, and all of them, sooner or later, liable to be dissolved and absorbed by the roots. The cartilage, indeed, when the bones have been buried in a dry situation, is very indestructible; but when exposed to the action of air, water, soil, and vegetation, will probably pass into the state of jelly, and be dissolved, or otherwise decomposed, 20 | 21 | * Doncaster Report, p. 8. 
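The "about sixty per cent" figure for the soft parts in the passage above follows directly from the composition table (a check added for illustration; it is not part of the scanned page):

$$40 \;(\text{cartilage and jelly}) + 20 \;(\text{fatty matter}) = 60 \text{ parts in } 100.$$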
-------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/test-graphical-text_pg1_repeat1.md: -------------------------------------------------------------------------------- 1 | THE POWER OF STORYTELLING 2 | FOR LEADERS 3 | ดร.วิทย์ สิทธิเวคิน -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/buildingnotes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/buildingnotes.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/discoverworld_crazy_table4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/discoverworld_crazy_table4.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/earnings.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/earnings.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/headers_footers/ff0f0b22c55d8b90dd77d153f48e144fc9db_pg2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/headers_footers/ff0f0b22c55d8b90dd77d153f48e144fc9db_pg2.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/headers_footers/ff1fc6a205ad039139ce566851b6b260c929_pg1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/headers_footers/ff1fc6a205ad039139ce566851b6b260c929_pg1.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/headers_footers/ff3d6e051903fe5ca9bc172ece14964c5632_pg1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/headers_footers/ff3d6e051903fe5ca9bc172ece14964c5632_pg1.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/headers_footers/ff4f7dad78081cff727d19ab51c181d4a661_pg1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/headers_footers/ff4f7dad78081cff727d19ab51c181d4a661_pg1.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/headers_footers/ff518b1240a66978f22035528ccb029450b5_pg2.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/headers_footers/ff518b1240a66978f22035528ccb029450b5_pg2.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/headers_footers/ffaac214730d2b8c2ec842e3618ccb9c4259_pg1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/headers_footers/ffaac214730d2b8c2ec842e3618ccb9c4259_pg1.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/headers_footers/fff590bed29a2854ac1f874dad5752ede1aa_pg1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/headers_footers/fff590bed29a2854ac1f874dad5752ede1aa_pg1.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/lincoln_letter.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/lincoln_letter.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/math_2503_04086.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/math_2503_04086.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/mathfuncs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/mathfuncs.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/mathfuncs_colswitch.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/mathfuncs_colswitch.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/mattsnotes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/mattsnotes.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/multi_column_miss.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/multi_column_miss.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/olmo2-pg4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/olmo2-pg4.pdf 
-------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/openstax_caculus_pg_273.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/openstax_caculus_pg_273.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/small_page_size.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/small_page_size.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/test-graphical-text.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/sample_data/pdfs/test-graphical-text.pdf -------------------------------------------------------------------------------- /olmocr/bench/scripts/run_difference.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from openai import OpenAI 4 | from runners.run_chatgpt import run_chatgpt 5 | from runners.run_gemini import run_gemini 6 | 7 | from olmocr.data.renderpdf import render_pdf_to_base64png 8 | 9 | 10 | def build_find_difference_prompt(base_text: str) -> str: 11 | return ( 12 | "Below is an image of a document page, along with raw textual content previously extracted using different models. " 13 | "Your goal is to carefully identify the differences between the extracted texts from both models and determine which one is more accurate by comparing them with the image. " 14 | "Only return the differences and specify which model extracted the text with higher accuracy.\n" 15 | "Do not hallucinate.\n" 16 | f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END" 17 | ) 18 | 19 | 20 | def combined_output(pdf_path: str) -> str: 21 | chatgpt_output = run_chatgpt(pdf_path) # page_num is not forwarded here; the runner uses its default page 22 | gemini_output = run_gemini(pdf_path) 23 | return f"ChatGPT OUTPUT: \n{chatgpt_output}\n\nGemini OUTPUT: \n{gemini_output}" 24 | 25 | 26 | def run_difference(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-06", temperature: float = 0.1) -> str: 27 | """ 28 | Compare the ChatGPT and Gemini OCR outputs for a page of a PDF and judge which is more accurate. 29 | 30 | This function renders the requested page of the PDF to an image, collects both OCR outputs, 31 | and asks the judging model to identify their differences and pick the more accurate one. 32 | 33 | Args: 34 | pdf_path (str): The local path to the PDF file. 35 | page_num (int): Which page from the document to pass. 36 | model (str): Model used as the judge. 37 | temperature (float): Sampling temperature passed to the model. 38 | 39 | Returns: 40 | str: The result in markdown format. 41 | """ 42 | # Convert the requested page of the PDF to a base64-encoded PNG image. 
43 | image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048) 44 | anchor_text = combined_output(pdf_path) 45 | client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) 46 | 47 | response = client.chat.completions.create( 48 | model=model, 49 | messages=[ 50 | { 51 | "role": "user", 52 | "content": [ 53 | {"type": "text", "text": build_find_difference_prompt(anchor_text)}, 54 | {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}, 55 | ], 56 | } 57 | ], 58 | temperature=temperature, 59 | max_tokens=3000, 60 | ) 61 | 62 | raw_response = response.choices[0].message.content 63 | 64 | return raw_response 65 | -------------------------------------------------------------------------------- /olmocr/bench/synth/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/bench/synth/__init__.py -------------------------------------------------------------------------------- /olmocr/bench/templates/all_done.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | All Tests Reviewed 7 | 31 | 32 | 33 |
34 | All Tests Reviewed! 35 | You have completed reviewing all tests in the dataset. 36 | 37 | 38 | -------------------------------------------------------------------------------- /olmocr/bench/templates/all_done_latex.html: -------------------------------------------------------------------------------- 1 | All Done! 🎉 2 | You have reviewed all equations in the dataset.
-------------------------------------------------------------------------------- /olmocr/check.py: -------------------------------------------------------------------------------- 1 | import importlib.util 2 | import logging 3 | import subprocess 4 | import sys 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def check_poppler_version(): 10 | try: 11 | result = subprocess.run(["pdftoppm", "-h"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) 12 | if result.returncode == 0 and result.stderr.startswith("pdftoppm"): 13 | logger.info("pdftoppm is installed and working.") 14 | else: 15 | logger.error("pdftoppm is installed but returned an error.") 16 | sys.exit(1) 17 | except FileNotFoundError: 18 | logger.error("pdftoppm is not installed.") 19 | logger.error("Check the README at https://github.com/allenai/olmocr/blob/main/README.md for installation instructions") 20 | sys.exit(1) 21 | 22 | 23 | def check_sglang_version(): 24 | if importlib.util.find_spec("sglang") is None: 25 | logger.error("Please make sure sglang is installed according to the latest instructions here: https://docs.sglang.ai/start/install.html") 26 | logger.error("Sglang needs to be installed with a separate command in order to find all dependencies properly.") 27 | sys.exit(1) 28 | 29 | 30 | def check_torch_gpu_available(min_gpu_memory: int = 20 * 1024**3): 31 | try: 32 | import torch 33 | except ImportError: 34 | logger.error("PyTorch must be installed; visit https://pytorch.org/ for installation instructions") 35 | raise 36 | 37 | try: 38 | gpu_memory = torch.cuda.get_device_properties(0).total_memory 39 | assert gpu_memory >= min_gpu_memory 40 | except Exception: 41 | logger.error(f"Torch was not able to find a GPU with at least {min_gpu_memory // (1024 ** 3)} GB of RAM.") 42 | raise 43 | 44 | 45 | if __name__ == "__main__": 46 | check_poppler_version() 47 | check_sglang_version() 48 | -------------------------------------------------------------------------------- /olmocr/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/data/__init__.py -------------------------------------------------------------------------------- /olmocr/datatypes.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import hashlib 3 | import json 4 | from dataclasses import dataclass 5 | 6 | 7 | @dataclass(frozen=True) 8 | class PdfOutput: 9 | path: str 10 | text: str 11 | total_pdf_pages: int 12 | processed_pdf_pages: int 13 | 14 | def mk_dolma_doc(self, **kwargs) -> str: 15 | metadata = { 16 | "Source-File": self.path, 17 | "pdf-pages": self.processed_pdf_pages, 18 | "pdf-total-pages": self.total_pdf_pages, 19 | # Kwargs are added as extra metadata 20 | **kwargs, 21 | } 22 | id_ = hashlib.sha1(self.text.encode()).hexdigest() 23 | 24 | dolma_doc = { 25 | "id": id_, 26 | "text": self.text, 27 | "source": "s2pdf", 28 | "added": datetime.datetime.now().strftime("%Y-%m-%d"), 29 | "created": datetime.datetime.now().strftime("%Y-%m-%d"), 30 | "metadata": metadata, 31 | } 32 | 33 | return json.dumps(dolma_doc) 34 | -------------------------------------------------------------------------------- /olmocr/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/eval/__init__.py 
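A minimal sketch of how the `PdfOutput` dataclass above is meant to be used (the values and the extra keyword argument are hypothetical, added only for illustration; any extra kwargs are folded into the resulting document's metadata):

```python
from olmocr.datatypes import PdfOutput

# Hypothetical result for a fully processed 3-page PDF.
out = PdfOutput(
    path="s3://bucket/example.pdf",  # illustrative path only
    text="Extracted plain text of the document...",
    total_pdf_pages=3,
    processed_pdf_pages=3,
)

# mk_dolma_doc returns a JSON string; the id is a SHA-1 hash of the text,
# and any extra kwargs (here a hypothetical model tag) land in "metadata".
doc_json = out.mk_dolma_doc(model="allenai/olmOCR-7B-0225-preview")
print(doc_json)
```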
-------------------------------------------------------------------------------- /olmocr/eval/dolma_refine/aligners.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | from sequence_align.pairwise import hirschberg, needleman_wunsch 4 | 5 | from .registry import BaseRegistry 6 | 7 | 8 | class AlignerRegistry(BaseRegistry[Type["BaseAligner"]]): 9 | """A registry for aligners.""" 10 | 11 | 12 | class BaseAligner: 13 | def __init__(self, *args, **kwargs): 14 | super().__init__() 15 | 16 | def align(self, gold: list[str], pred: list[str]) -> tuple[list[str], list[str]]: 17 | raise NotImplementedError() 18 | 19 | 20 | @AlignerRegistry.add("hirschberg") 21 | class HirschbergAligner(BaseAligner): 22 | def __init__( 23 | self, 24 | match_score: float = 1.0, 25 | mismatch_score: float = -1.0, 26 | indel_score: float = -1.0, 27 | gap_token: str = "▓", 28 | ): 29 | self.match_score = match_score 30 | self.mismatch_score = mismatch_score 31 | self.indel_score = indel_score 32 | self.gap_token = gap_token 33 | super().__init__() 34 | 35 | def align(self, gold: list[str], pred: list[str]) -> tuple[list[str], list[str]]: 36 | return hirschberg( 37 | gold, 38 | pred, 39 | match_score=self.match_score, 40 | mismatch_score=self.mismatch_score, 41 | indel_score=self.indel_score, 42 | gap=self.gap_token, 43 | ) 44 | 45 | 46 | @AlignerRegistry.add("needleman-wunsch") 47 | class NeedlemanWunschAligner(BaseAligner): 48 | def __init__( 49 | self, 50 | match_score: float = 1.0, 51 | mismatch_score: float = -1.0, 52 | indel_score: float = -1.0, 53 | gap_token: str = "▓", 54 | ): 55 | self.match_score = match_score 56 | self.mismatch_score = mismatch_score 57 | self.indel_score = indel_score 58 | self.gap_token = gap_token 59 | super().__init__() 60 | 61 | def align(self, gold: list[str], pred: list[str]) -> tuple[list[str], list[str]]: 62 | return needleman_wunsch( 63 | gold, 64 | pred, 65 | match_score=self.match_score, 66 | mismatch_score=self.mismatch_score, 67 | indel_score=self.indel_score, 68 | gap=self.gap_token, 69 | ) 70 | -------------------------------------------------------------------------------- /olmocr/eval/dolma_refine/segmenters.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | from spacy.lang.en import English 4 | 5 | from .registry import BaseRegistry 6 | 7 | 8 | class SegmenterRegistry(BaseRegistry[Type["BaseSegmenter"]]): 9 | """A registry for segmenters.""" 10 | 11 | 12 | class BaseSegmenter: 13 | def __init__(self, segmenter_name_or_path: str, *args, **kwargs): 14 | super().__init__() 15 | 16 | def segment(self, text: str) -> list[str]: 17 | raise NotImplementedError() 18 | 19 | 20 | @SegmenterRegistry.add("spacy") 21 | class SpacySegmenter(BaseSegmenter): 22 | def __init__(self, segmenter_name_or_path: str, *args, **kwargs): 23 | assert segmenter_name_or_path == "spacy", "Only 'spacy' segmenter is supported" 24 | self.nlp = English() 25 | self.nlp.add_pipe("sentencizer") 26 | 27 | def segment(self, text: str) -> list[str]: 28 | return [sent.text_with_ws for sent in self.nlp(text).sents] 29 | -------------------------------------------------------------------------------- /olmocr/filter/__init__.py: -------------------------------------------------------------------------------- 1 | from .filter import PdfFilter 2 | -------------------------------------------------------------------------------- /olmocr/filter/coherency.py: 
--------------------------------------------------------------------------------
from functools import lru_cache

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


@lru_cache()
def load_coherency_model(model_name: str = "HuggingFaceTB/SmolLM-135M"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
    model.eval()  # Set the model to evaluation mode

    return tokenizer, model


def get_document_coherency(text: str) -> float:
    """
    Calculates the coherency of a document based on the log likelihood of its tokens.
    Handles texts longer than the model's maximum token limit by splitting them into chunks.

    Args:
        text (str): The input text to evaluate.

    Returns:
        float: The average log likelihood per token as a measure of coherency.
    """
    tokenizer, model = load_coherency_model()

    # Determine the model's maximum number of tokens
    max_length = tokenizer.model_max_length - 1
    # Some tokenizers have a default value indicating no limit; use model config if so
    if max_length > 1_000_000:
        max_length = model.config.max_position_embeddings

    # Tokenize the entire text
    tokens = tokenizer.encode(text, return_tensors="pt").squeeze(0)

    total_log_likelihood = 0.0
    total_tokens = 0

    # Split tokens into chunks that fit within the model's max length
    for i in range(0, len(tokens), max_length):
        chunk = tokens[i : i + max_length]

        # Build the model inputs on CPU, adding a batch dimension
        inputs = {"input_ids": chunk.unsqueeze(0).cpu()}

        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            # The loss is the mean negative log likelihood per token, so scale
            # it back up by the chunk length to get the chunk's log likelihood
            log_likelihood = -outputs.loss.item() * chunk.size(0)
            total_log_likelihood += log_likelihood
            total_tokens += chunk.size(0)

    # Calculate the average log likelihood per token
    avg_log_likelihood = total_log_likelihood / total_tokens if total_tokens > 0 else 0.0

    return avg_log_likelihood
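A minimal sketch of how this filter might be called; the sample strings are invented, and the scores are only meaningful relative to each other (they are average log likelihoods per token, so negative, with values closer to zero suggesting more natural text):

```python
# Hypothetical usage sketch for get_document_coherency; the sample texts are
# made up, and the commented score ranges are illustrative, not measured.
from olmocr.filter.coherency import get_document_coherency

clean_text = "The quarterly report shows steady growth in revenue."
garbled_text = "Th3 qu@rt3rly r3p0rt sh0ws st3@dy gr0wth 1n r3v3nu3."

print(get_document_coherency(clean_text))    # higher (closer to zero)
print(get_document_coherency(garbled_text))  # noticeably lower
```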
--------------------------------------------------------------------------------
/olmocr/image_utils.py:
--------------------------------------------------------------------------------
import os
import subprocess
from typing import List, Union


def convert_image_to_pdf_bytes(image_files: Union[str, List[str]]) -> bytes:
    """
    Convert one or multiple image files to PDF bytes.

    Args:
        image_files: A single image file path (str) or a list of image file paths

    Returns:
        bytes: The PDF content as bytes

    Raises:
        RuntimeError: If the conversion fails
        ValueError: If invalid input is provided
    """
    # Handle different input types
    if isinstance(image_files, str):
        # Single image case
        image_files = [image_files]
    elif not isinstance(image_files, list) or not image_files:
        raise ValueError("image_files must be a non-empty string or list of strings")

    # Validate that every input file exists (format errors are left to img2pdf)
    for image_file in image_files:
        if not os.path.exists(image_file):
            raise ValueError(f"File does not exist: {image_file}")

    try:
        # Run img2pdf with all images as arguments
        result = subprocess.run(["img2pdf"] + image_files, check=True, capture_output=True)

        # Return the stdout content which contains the PDF data
        return result.stdout

    except subprocess.CalledProcessError as e:
        # Raise error with stderr information if the conversion fails
        raise RuntimeError(f"Error converting image(s) to PDF: {e.stderr.decode('utf-8')}")


def is_png(file_path):
    try:
        with open(file_path, "rb") as f:
            header = f.read(8)
            return header == b"\x89PNG\r\n\x1a\n"
    except Exception as e:
        print(f"Error: {e}")
        return False


def is_jpeg(file_path):
    try:
        with open(file_path, "rb") as f:
            header = f.read(2)
            return header == b"\xff\xd8"
    except Exception as e:
        print(f"Error: {e}")
        return False
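A short usage sketch for these helpers, assuming the `img2pdf` CLI is installed and on PATH; the input file names are placeholders:

```python
# Hypothetical usage sketch; requires the img2pdf command-line tool.
# The file names below are placeholders, not files from this repo.
from olmocr.image_utils import convert_image_to_pdf_bytes, is_png

pages = ["scan_page1.png", "scan_page2.png"]
if all(is_png(p) for p in pages):
    pdf_bytes = convert_image_to_pdf_bytes(pages)
    with open("scan.pdf", "wb") as f:
        f.write(pdf_bytes)
```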
--------------------------------------------------------------------------------
/olmocr/loadertest.py:
--------------------------------------------------------------------------------
import json
from concurrent.futures import ProcessPoolExecutor, as_completed

import boto3
from tqdm import tqdm

# Configuration
BUCKET = "ai2-llm"
PREFIX = "pretraining-data/sources/soldni-open-access-books/v0/pipeline/results"
OUTPUT_FILENAME = "all_completed_files.txt"


def process_file(key: str):
    """
    Process a single S3 file given by its key.
    Reads a jsonl file from S3, decodes each line,
    extracts the 'Source-File' from the 'metadata' field,
    and returns a list of these source file strings.
    """
    # Create a new S3 client in the worker process (boto3 clients should not
    # be shared across processes)
    s3 = boto3.client("s3")
    extracted_lines = []
    try:
        response = s3.get_object(Bucket=BUCKET, Key=key)
        for raw_line in response["Body"].iter_lines():
            try:
                # Decode the line from bytes to text
                line_str = raw_line.decode("utf-8")
            except UnicodeDecodeError as e:
                print(f"Skipping a line in {key} due to decode error: {e}")
                continue
            try:
                data = json.loads(line_str)
            except json.JSONDecodeError as e:
                print(f"Skipping a malformed json line in {key}: {e}")
                continue
            # Extract 'Source-File' from metadata if present
            metadata = data.get("metadata", {})
            source_file = metadata.get("Source-File")
            if source_file:
                extracted_lines.append(source_file)
    except Exception as e:
        print(f"Error processing file {key}: {e}")
    return extracted_lines


def main():
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")
    page_iterator = paginator.paginate(Bucket=BUCKET, Prefix=PREFIX)

    # Gather all S3 object keys under the specified prefix
    keys = []
    for page in page_iterator:
        if "Contents" not in page:
            continue
        for obj in page["Contents"]:
            keys.append(obj["Key"])

    print(f"Found {len(keys)} files to process.")

    # Open the output file for writing
    with open(OUTPUT_FILENAME, "w", encoding="utf-8") as output_file:
        # Create a process pool to process files concurrently.
        # Adjust max_workers based on your environment and workload.
        with ProcessPoolExecutor() as executor:
            # Submit all processing jobs and map each future to its key
            future_to_key = {executor.submit(process_file, key): key for key in keys}
            # Use tqdm to wrap the as_completed iterator for progress display
            for future in tqdm(as_completed(future_to_key), total=len(future_to_key), desc="Processing files"):
                try:
                    source_files = future.result()
                    # Write each extracted line to the output file as soon as the future completes
                    for source in source_files:
                        output_file.write(source + "\n")
                    # Optionally flush after each completed task
                    output_file.flush()
                except Exception as e:
                    key = future_to_key[future]
                    print(f"Exception occurred for file {key}: {e}")

    print(f"Finished writing the source file names to {OUTPUT_FILENAME}")


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/olmocr/prompts/__init__.py:
--------------------------------------------------------------------------------
from .prompts import (
    PageResponse,
    build_finetuning_prompt,
    build_openai_silver_data_prompt,
    extract_raw_text,
    openai_response_format_schema,
)

--------------------------------------------------------------------------------
/olmocr/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/py.typed

--------------------------------------------------------------------------------
/olmocr/train/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/train/__init__.py

--------------------------------------------------------------------------------
/olmocr/train/config/molmo-o-lora-8192.yaml:
--------------------------------------------------------------------------------
model:
  name_or_path: allenai/Molmo-7B-O-0924
  arch: causal
  use_flash_attn: true

wandb:
  project: pdelfin
  entity: ai2-llm

generate:
  max_length: 8192

train_data:
  seed: 1337
  cache_location: /data/jakep/pdfdata/pdelfin_cache
  sources:
    - name: openai_batch_data_v5_1_train
      response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
      target_longest_image_dim: [1024]
      target_anchor_text_len: [6000]
    - name: openai_batch_data_v5_1_iabooks_train
      response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
      target_longest_image_dim: [1024]
      target_anchor_text_len: [6000]

valid_data:
  cache_location: /data/jakep/pdfdata/pdelfin_cache
  metric_for_best_model: openai_batch_data_v5_1_eval_loss
  sources:
    # These tend to be small, so you can load from s3 it's no big deal
    - name: openai_batch_data_v5_1_eval
      response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
      target_longest_image_dim: [1024]
      target_anchor_text_len: [6000]
    - name: openai_batch_data_v5_1_iabooks_eval
      response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_iabooks_eval/*.json
      target_longest_image_dim: [1024]
      target_anchor_text_len: [6000]

# Mostly pulled from https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh
hparams:
  batch_size: 1
  eval_batch_size: 1
  gradient_accumulation_steps: 4
  gradient_checkpointing: true
  find_unused_parameters: true
  clip_grad_norm: 1.0
  learning_rate: 3e-4
  max_steps: 10000
  pad_multiple_of: 16
  log_every_steps: 10
  eval_every_steps: 100
  optim: adamw_torch
  lr_scheduler: cosine
  weight_decay: 0.01
  warmup_ratio: 0.03

# From https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py
lora:
  rank: 32
  alpha: 32
  dropout: 0.05
  task_type: CAUSAL_LM
  target_modules:
    # attention layers in main transformer
    - att_proj
    - ff_proj
    - attn_out
    - ff_out
    # vision transformer attention and FF
    - attention.wq
    - attention.wk
    - attention.wv
    - attention.wo
    - feed_forward.w1
    - feed_forward.w2
    # vision image projector
    - vision_backbone.image_projector.w1
    - vision_backbone.image_projector.w2
    - vision_backbone.image_projector.w3

save:
  path: s3://ai2-oe-data/jakep/experiments/molmo-o-0924/v1/models/
  save_every_steps: 1000

max_workers: 10
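The `lora` block in these configs corresponds to the standard PEFT adapter settings. A minimal sketch of that correspondence, assuming the usual `peft` package; this is an illustration of the mapping, not the repo's actual training code:

```python
# Illustrative sketch only: how a config's lora block maps onto a peft
# LoraConfig. Field comments name the YAML keys above; the truncated
# target_modules list is just for brevity.
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=32,                   # lora.rank
    lora_alpha=32,          # lora.alpha
    lora_dropout=0.05,      # lora.dropout
    task_type="CAUSAL_LM",  # lora.task_type
    target_modules=[
        "att_proj", "ff_proj", "attn_out", "ff_out",  # main transformer layers
    ],
)

# model = get_peft_model(base_model, lora_config)  # base_model loaded elsewhere
```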
--------------------------------------------------------------------------------
/olmocr/train/config/molmo-o-lora.yaml:
--------------------------------------------------------------------------------
model:
  name_or_path: allenai/Molmo-7B-O-0924
  arch: causal
  use_flash_attn: true

wandb:
  project: pdelfin
  entity: ai2-llm

generate:
  max_length: 4096

train_data:
  seed: 1337
  cache_location: /data/jakep/pdfdata/pdelfin_cache
  sources:
    - name: openai_batch_data_v5_1_train
      response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
      target_longest_image_dim: [1024]
      target_anchor_text_len: [6000]
    - name: openai_batch_data_v5_1_iabooks_train
      response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
      target_longest_image_dim: [1024]
      target_anchor_text_len: [6000]

valid_data:
  cache_location: /data/jakep/pdfdata/pdelfin_cache
  metric_for_best_model: openai_batch_data_v5_1_eval_loss
  sources:
    # These tend to be small, so you can load from s3 it's no big deal
    - name: openai_batch_data_v5_1_eval
      response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
      target_longest_image_dim: [1024]
      target_anchor_text_len: [6000]
    - name: openai_batch_data_v5_1_iabooks_eval
      response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_iabooks_eval/*.json
      target_longest_image_dim: [1024]
      target_anchor_text_len: [6000]

# Mostly pulled from https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh
hparams:
  batch_size: 1
  eval_batch_size: 1
  gradient_accumulation_steps: 4
  gradient_checkpointing: true
  find_unused_parameters: true
  clip_grad_norm: 1.0
  learning_rate: 1e-4
  max_steps: 10000
  pad_multiple_of: 16
  log_every_steps: 10
  eval_every_steps: 100
  optim: adamw_torch
  lr_scheduler: cosine
  weight_decay: 0.01
  warmup_ratio: 0.03

# From https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py
lora:
  rank: 32
  alpha: 32
  dropout: 0.05
  task_type: CAUSAL_LM
  target_modules:
    # attention layers in main transformer
    - att_proj
    - ff_proj
    - attn_out
    - ff_out
    # vision transformer attention and FF
    - attention.wq
    - attention.wk
    - attention.wv
    - attention.wo
    - feed_forward.w1
    - feed_forward.w2
    # vision image projector
    - vision_backbone.image_projector.w1
    - vision_backbone.image_projector.w2
    - vision_backbone.image_projector.w3

save:
  path: s3://ai2-oe-data/jakep/experiments/molmo-o-0924/v1/models/
  save_every_steps: 1000

max_workers: 10

--------------------------------------------------------------------------------
/olmocr/train/config/qwen25vl-7b.yaml:
--------------------------------------------------------------------------------
1 | model: 2 | name_or_path: Qwen/Qwen2.5-VL-7B-Instruct 3 | arch: causal 4 | use_flash_attn: true 5 | 6 | wandb: 7 | project: pdelfin 8 | entity: ai2-llm 9 | 10 | generate: 11 | max_length: 8192 12 | 13 | train_data: 14 | seed: 1337 15 | cache_location: /data/jakep/pdfdata/pdelfin_cache 16 | sources: 17 | # These tend to be small, so you can load from s3 it's no big deal 18 | - name: openai_batch_data_v5_1_eval 19 | response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json 20 | target_longest_image_dim: [1024] 21 | target_anchor_text_len: [6000] 22 | - name: openai_batch_data_v5_1_iabooks_eval 23 | response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_iabooks_eval/*.json 24 | target_longest_image_dim: [1024] 25 | target_anchor_text_len: [6000] 26 | # - name: openai_batch_data_v5_1_train 27 | # response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json 28 | # target_longest_image_dim: [1024] 29 | # target_anchor_text_len: [6000] 30 | # - name: openai_batch_data_v5_1_iabooks_train 31 | # response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json 32 | # target_longest_image_dim: [1024] 33 | # target_anchor_text_len: [6000] 34 | 35 | valid_data: 36 | cache_location:
/data/jakep/pdfdata/pdelfin_cache 37 | metric_for_best_model: openai_batch_data_v5_1_eval_loss 38 | sources: 39 | # These tend to be small, so you can load from s3 it's no big deal 40 | - name: openai_batch_data_v5_1_eval 41 | response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json 42 | target_longest_image_dim: [1024] 43 | target_anchor_text_len: [6000] 44 | - name: openai_batch_data_v5_1_iabooks_eval 45 | response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_iabooks_eval/*.json 46 | target_longest_image_dim: [1024] 47 | target_anchor_text_len: [6000] 48 | 49 | 50 | 51 | # Mostly pulled from https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh 52 | hparams: 53 | batch_size: 1 54 | eval_batch_size: 1 55 | gradient_accumulation_steps: 4 56 | gradient_checkpointing: true 57 | clip_grad_norm: 1.0 58 | learning_rate: 1e-6 59 | max_steps: 10000 60 | pad_multiple_of: 16 61 | log_every_steps: 10 62 | eval_every_steps: 100 63 | optim: adamw_torch 64 | lr_scheduler: cosine 65 | weight_decay: 0.01 66 | warmup_ratio: 0.03 67 | 68 | 69 | save: 70 | path: s3://ai2-oe-data/jakep/experiments/qwen25vl-pdf/v1/models/ 71 | save_every_steps: 9500 72 | 73 | max_workers: 10 -------------------------------------------------------------------------------- /olmocr/train/config/qwen2vl-2b-lora.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | name_or_path: Qwen/Qwen2-VL-2B-Instruct 3 | arch: causal 4 | use_flash_attn: true 5 | 6 | wandb: 7 | project: pdelfin 8 | entity: ai2-llm 9 | 10 | # TODO This is not used 11 | format: 12 | instruction_template: "Original:" 13 | response_template: "Rewritten:" 14 | # Template from here: https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py#L30 15 | chat_template: | 16 | {% for message in messages %} 17 | {{'<|im_start|>' + message['role'] + '\n' + message['content']}} 18 | {% if loop.last %} 19 | {{ '<|im_end|>'}} 20 | {% else %} 21 | {{ '<|im_end|>\n' }} 22 | {% endif %} 23 | {% endfor %} 24 | 25 | generate: 26 | max_length: 4096 27 | 28 | train_data: 29 | seed: 1337 30 | sources: 31 | - name: openai_batch_data_v2 32 | query_glob_path: s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl 33 | response_glob_path: s3://ai2-oe-data/jakep/openai_batch_done_v2/*.json 34 | backend: 35 | - openai 36 | size: 100_000 37 | 38 | valid_data: 39 | sources: 40 | - name: openai_batch_data_eval_mini 41 | query_glob_path: s3://ai2-oe-data/jakep/openai_batch_data_eval_mini/*.jsonl 42 | response_glob_path: s3://ai2-oe-data/jakep/openai_batch_done_eval_mini/*.json 43 | backend: 44 | - openai 45 | size: 100_000 46 | 47 | # Mostly pulled from https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh 48 | hparams: 49 | batch_size: 1 50 | eval_batch_size: 1 51 | gradient_accumulation_steps: 4 52 | gradient_checkpointing: false 53 | clip_grad_norm: 1.0 54 | learning_rate: 3e-4 55 | max_steps: 2000 56 | pad_multiple_of: 16 57 | log_every_steps: 50 58 | eval_every_steps: 1000 59 | optim: adamw_torch 60 | lr_scheduler: cosine 61 | weight_decay: 0.01 62 | warmup_ratio: 0.03 63 | 64 | # From https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py 65 | lora: 66 | rank: 32 67 | alpha: 32 68 | dropout: 0.05 69 | task_type: causal_lm 70 | target_modules: 71 | - q_proj 72 | - k_proj 73 | - v_proj 74 | - o_proj 75 | - gate_proj 76 | - up_proj 77 | - down_proj 78 | - visual.blocks.[0-9]+.attn.qkv 79 | - visual.blocks.[0-9]+.attn.proj 80 | - 
visual.blocks.[0-9]+.mlp.fc1 81 | - visual.blocks.[0-9]+.mlp.fc2 82 | - visual.merger.mlp.0 83 | - visual.merger.mlp.2 84 | 85 | save: 86 | path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/ 87 | save_every_steps: 1000 88 | 89 | max_workers: 10 -------------------------------------------------------------------------------- /olmocr/train/config/qwen2vl-2b.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | name_or_path: Qwen/Qwen2-VL-2B-Instruct 3 | arch: causal 4 | use_flash_attn: true 5 | 6 | wandb: 7 | project: pdelfin 8 | entity: ai2-llm 9 | 10 | # TODO This is not used 11 | format: 12 | instruction_template: "Original:" 13 | response_template: "Rewritten:" 14 | # Template from here: https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py#L30 15 | chat_template: | 16 | {% for message in messages %} 17 | {{'<|im_start|>' + message['role'] + '\n' + message['content']}} 18 | {% if loop.last %} 19 | {{ '<|im_end|>'}} 20 | {% else %} 21 | {{ '<|im_end|>\n' }} 22 | {% endif %} 23 | {% endfor %} 24 | 25 | generate: 26 | max_length: 4096 27 | 28 | train_data: 29 | seed: 1337 30 | sources: 31 | - name: openai_batch_data_v2 32 | query_glob_path: s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl 33 | response_glob_path: s3://ai2-oe-data/jakep/openai_batch_done_v2/*.json 34 | backend: 35 | - openai 36 | size: 100_000 37 | 38 | valid_data: 39 | sources: 40 | - name: openai_batch_data_eval_mini 41 | query_glob_path: s3://ai2-oe-data/jakep/openai_batch_data_eval_mini/*.jsonl 42 | response_glob_path: s3://ai2-oe-data/jakep/openai_batch_done_eval_mini/*.json 43 | backend: 44 | - openai 45 | size: 100_000 46 | 47 | # Mostly pulled from https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh 48 | hparams: 49 | batch_size: 1 50 | eval_batch_size: 1 51 | gradient_accumulation_steps: 4 52 | gradient_checkpointing: false 53 | clip_grad_norm: 1.0 54 | learning_rate: 3e-4 55 | max_steps: 2000 56 | pad_multiple_of: 16 57 | log_every_steps: 50 58 | eval_every_steps: 1000 59 | optim: adamw_torch 60 | lr_scheduler: cosine 61 | weight_decay: 0.01 62 | warmup_ratio: 0.03 63 | 64 | # From https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py 65 | # Disable LORA for now, because we want the visual network to get trained too 66 | # lora: 67 | # rank: 32 68 | # alpha: 32 69 | # dropout: 0.05 70 | # task_type: causal_lm 71 | # target_modules: 72 | # - q_proj 73 | # - k_proj 74 | # - v_proj 75 | # - o_proj 76 | # - gate_proj 77 | # - up_proj 78 | # - down_proj 79 | 80 | save: 81 | path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/ 82 | save_every_steps: 1000 83 | 84 | max_workers: 10 -------------------------------------------------------------------------------- /olmocr/train/config/qwen2vl-7b-lora.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | name_or_path: Qwen/Qwen2-VL-7B-Instruct 3 | arch: causal 4 | use_flash_attn: true 5 | 6 | wandb: 7 | project: pdelfin 8 | entity: ai2-llm 9 | 10 | generate: 11 | max_length: 8192 12 | 13 | train_data: 14 | seed: 1337 15 | cache_location: /data/jakep/pdfdata/pdelfin_cache 16 | sources: 17 | - name: openai_batch_data_v5_1_train 18 | response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json 19 | target_longest_image_dim: 1024 20 | target_anchor_text_len: 6000 21 | - name: openai_batch_data_v5_1_iabooks_train 22 | response_glob_path: 
/data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json 23 | target_longest_image_dim: 1024 24 | target_anchor_text_len: 6000 25 | 26 | valid_data: 27 | cache_location: /data/jakep/pdfdata/pdelfin_cache 28 | metric_for_best_model: openai_batch_data_v5_1_eval_loss 29 | sources: 30 | # These tend to be small, so you can load from s3 it's no big deal 31 | - name: openai_batch_data_v5_1_eval 32 | response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json 33 | target_longest_image_dim: 1024 34 | target_anchor_text_len: 6000 35 | - name: openai_batch_data_v5_1_iabooks_eval 36 | response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_iabooks_eval/*.json 37 | target_longest_image_dim: 1024 38 | target_anchor_text_len: 6000 39 | 40 | 41 | 42 | # Mostly pulled from https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh 43 | hparams: 44 | batch_size: 1 45 | eval_batch_size: 1 46 | gradient_accumulation_steps: 4 47 | gradient_checkpointing: true 48 | clip_grad_norm: 1.0 49 | learning_rate: 1e-4 50 | max_steps: 10000 51 | pad_multiple_of: 16 52 | log_every_steps: 10 53 | eval_every_steps: 100 54 | optim: adamw_torch 55 | lr_scheduler: cosine 56 | weight_decay: 0.01 57 | warmup_ratio: 0.03 58 | 59 | # From https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py 60 | lora: 61 | rank: 32 62 | alpha: 32 63 | dropout: 0.05 64 | task_type: causal_lm 65 | target_modules: 66 | - q_proj 67 | - k_proj 68 | - v_proj 69 | - o_proj 70 | - gate_proj 71 | - up_proj 72 | - down_proj 73 | - visual.blocks.[0-9]+.attn.qkv 74 | - visual.blocks.[0-9]+.attn.proj 75 | - visual.blocks.[0-9]+.mlp.fc1 76 | - visual.blocks.[0-9]+.mlp.fc2 77 | - visual.merger.mlp.0 78 | - visual.merger.mlp.2 79 | 80 | save: 81 | path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/ 82 | save_every_steps: 1000 83 | 84 | max_workers: 10 -------------------------------------------------------------------------------- /olmocr/train/config/qwen2vl-7b.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | name_or_path: Qwen/Qwen2-VL-7B-Instruct 3 | arch: causal 4 | use_flash_attn: true 5 | 6 | wandb: 7 | project: pdelfin 8 | entity: ai2-llm 9 | 10 | generate: 11 | max_length: 8192 12 | 13 | train_data: 14 | seed: 1337 15 | cache_location: /data/jakep/pdfdata/pdelfin_cache 16 | sources: 17 | - name: openai_batch_data_v5_1_train 18 | response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json 19 | target_longest_image_dim: [1024] 20 | target_anchor_text_len: [6000] 21 | - name: openai_batch_data_v5_1_iabooks_train 22 | response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json 23 | target_longest_image_dim: [1024] 24 | target_anchor_text_len: [6000] 25 | 26 | valid_data: 27 | cache_location: /data/jakep/pdfdata/pdelfin_cache 28 | metric_for_best_model: openai_batch_data_v5_1_eval_loss 29 | sources: 30 | # These tend to be small, so you can load from s3 it's no big deal 31 | - name: openai_batch_data_v5_1_eval 32 | response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json 33 | target_longest_image_dim: [1024] 34 | target_anchor_text_len: [6000] 35 | - name: openai_batch_data_v5_1_iabooks_eval 36 | response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_iabooks_eval/*.json 37 | target_longest_image_dim: [1024] 38 | target_anchor_text_len: [6000] 39 | 40 | 41 | 42 | # Mostly pulled from 
https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh 43 | hparams: 44 | batch_size: 1 45 | eval_batch_size: 1 46 | gradient_accumulation_steps: 4 47 | gradient_checkpointing: true 48 | clip_grad_norm: 1.0 49 | learning_rate: 1e-6 50 | max_steps: 10000 51 | pad_multiple_of: 16 52 | log_every_steps: 10 53 | eval_every_steps: 100 54 | optim: adamw_torch 55 | lr_scheduler: cosine 56 | weight_decay: 0.01 57 | warmup_ratio: 0.03 58 | 59 | 60 | save: 61 | path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/ 62 | save_every_steps: 9500 63 | 64 | max_workers: 10 -------------------------------------------------------------------------------- /olmocr/train/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/train/core/__init__.py -------------------------------------------------------------------------------- /olmocr/train/core/adapters.py: -------------------------------------------------------------------------------- 1 | import json 2 | from logging import Logger 3 | from typing import Optional, Type 4 | 5 | import smart_open 6 | import torch 7 | from peft.peft_model import PeftModel 8 | from transformers import ( 9 | AutoModelForCausalLM, 10 | AutoModelForSeq2SeqLM, 11 | AutoModelWithLMHead, 12 | AutoTokenizer, 13 | ) 14 | 15 | from .config import ModelConfig 16 | from .loggers import get_logger 17 | from .paths import cached_path, exists, get_cache_dir, join_path, resource_to_filename 18 | 19 | __all__ = ["load_model", "cache_merged_model"] 20 | 21 | 22 | def get_model_cls(config: ModelConfig) -> Type[AutoModelWithLMHead]: 23 | if config.arch == "seq2seq": 24 | return AutoModelForSeq2SeqLM # pyright: ignore 25 | elif config.arch == "causal" or config.arch == "vllm": 26 | return AutoModelForCausalLM # pyright: ignore 27 | else: 28 | raise ValueError(f"Unsupported model architecture: {config.arch}") 29 | 30 | 31 | def get_adapter_config(config: ModelConfig) -> dict: 32 | local_path = cached_path(config.name_or_path) 33 | if exists(adapter_config_path := join_path("", local_path, "adapter_config.json")): 34 | with smart_open.open(adapter_config_path, "rt", encoding="utf-8") as f: 35 | return json.load(f) 36 | return {} 37 | 38 | 39 | def load_model(config: ModelConfig, logger: Optional[Logger] = None) -> AutoModelWithLMHead: 40 | logger = logger or get_logger(__file__, level="INFO") 41 | 42 | logger.info(f"Loading model from {config.name_or_path}") 43 | local_path = cached_path(config.name_or_path) 44 | if local_path != config.name_or_path: 45 | logger.info(f"Model cached at {local_path}") 46 | 47 | if exists(adapter_config_path := join_path("", local_path, "adapter_config.json")): 48 | logger.info(f"Loading LoRA adapter from {adapter_config_path}") 49 | with smart_open.open(adapter_config_path) as f: 50 | adapter_config = json.load(f) 51 | base_model_name_or_path = adapter_config["base_model_name_or_path"] 52 | enable_lora = True 53 | else: 54 | base_model_name_or_path = local_path 55 | enable_lora = False 56 | 57 | model = get_model_cls(config).from_pretrained( 58 | base_model_name_or_path, 59 | device_map="auto", 60 | trust_remote_code=config.trust_remote_code, 61 | # low_cpu_mem_usage=model_config.low_cpu_mem_usage, 62 | use_flash_attention_2=True if config.use_flash_attn else False, 63 | revision=config.model_revision, 64 | torch_dtype=torch.bfloat16 if config.use_flash_attn else getattr(torch, config.dtype), 65 | ) 66 | 
logger.info(f"Successfully loaded base model from {base_model_name_or_path}") 67 | 68 | if enable_lora: 69 | peft_model = PeftModel.from_pretrained(model, local_path) 70 | model = peft_model.merge_and_unload() 71 | logger.info(f"Successfully loaded LoRA adapter from base model: {base_model_name_or_path}") 72 | 73 | return model 74 | 75 | 76 | def cache_merged_model(config: ModelConfig, logger: Optional[Logger] = None) -> str: 77 | logger = logger or get_logger(__file__, level="INFO") 78 | 79 | base_local_path = cached_path(config.name_or_path) 80 | adapter_config = get_adapter_config(config) 81 | if not adapter_config: 82 | logger.info("No adapter config found; using base model") 83 | return base_local_path 84 | 85 | local_fn = resource_to_filename(json.dumps({"adapter": adapter_config, "model": config.name_or_path})) 86 | merged_local_path = f"{get_cache_dir()}/{local_fn}" 87 | 88 | if not exists(merged_local_path): 89 | model = load_model(config=config, logger=logger) 90 | tokenizer = AutoTokenizer.from_pretrained(base_local_path) 91 | 92 | logger.info(f"Saving merged model to {merged_local_path}") 93 | model.save_pretrained(merged_local_path) 94 | tokenizer.save_pretrained(merged_local_path) 95 | 96 | return merged_local_path 97 | -------------------------------------------------------------------------------- /olmocr/train/core/compression.py: -------------------------------------------------------------------------------- 1 | from smart_open import register_compressor 2 | 3 | __all__ = ["mk_compression"] 4 | 5 | 6 | def mk_compression(): 7 | def _handle_zst(file_obj, mode): 8 | try: 9 | import zstandard as zstd 10 | except ImportError: 11 | raise ImportError("zstandard is required for zstd support") 12 | 13 | return zstd.open(file_obj, mode) 14 | 15 | register_compressor(".zstd", _handle_zst) 16 | register_compressor(".zst", _handle_zst) 17 | -------------------------------------------------------------------------------- /olmocr/train/core/errors.py: -------------------------------------------------------------------------------- 1 | class DolmaRefineError(RuntimeError): ... 2 | -------------------------------------------------------------------------------- /olmocr/train/core/loggers.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import multiprocessing 3 | from typing import Union 4 | 5 | LOGGER_PREFIX = "dolma-refine" 6 | 7 | 8 | def get_logger(name: str, level: Union[int, str] = logging.WARN) -> logging.Logger: 9 | if (proc_name := multiprocessing.current_process().name) == "MainProcess": 10 | proc_name = "main" 11 | proc_name = proc_name.replace(" ", "_") 12 | 13 | # set the log level 14 | level = level if isinstance(level, int) else getattr(logging, level.strip().upper(), logging.WARN) 15 | 16 | # set name 17 | name = f"{LOGGER_PREFIX}.{proc_name}.{name}" 18 | logger = logging.getLogger(name) 19 | logger.setLevel(level) 20 | 21 | # add handler 22 | if not logger.handlers: 23 | handler = logging.StreamHandler() 24 | formatter = logging.Formatter("[%(asctime)s %(name)s %(levelname)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S") 25 | handler.setFormatter(formatter) 26 | logger.addHandler(handler) 27 | 28 | return logger 29 | 30 | 31 | def reset_level(level: Union[int, str]) -> None: 32 | """ 33 | Reset the log level for all Dolma loggers. 34 | 35 | Args: 36 | level (Union[int, str]): The log level to set. 
It can be either an integer 37 | representing the log level (e.g., logging.DEBUG) or a string 38 | representing the log level name (e.g., 'debug'). 39 | 40 | Returns: 41 | None 42 | """ 43 | if isinstance(level, str): 44 | if (level_tmp := getattr(logging, level.strip().upper(), None)) is not None: 45 | level = level_tmp 46 | else: 47 | raise ValueError(f"Invalid log level: {level}") 48 | 49 | for logger in logging.Logger.manager.loggerDict.values(): 50 | if isinstance(logger, logging.Logger): 51 | if logger.name.startswith(LOGGER_PREFIX): 52 | logger.setLevel(level) 53 |

--------------------------------------------------------------------------------
/olmocr/train/core/state.py:
--------------------------------------------------------------------------------
import os
from dataclasses import dataclass
from typing import Optional


@dataclass
class BeakerState:
    job_id: Optional[str] = None
    job_kind: Optional[str] = None
    task_id: Optional[str] = None
    experiment_id: Optional[str] = None
    replica_rank: Optional[str] = None
    leader_replica_hostname: Optional[str] = None
    leader_replica_node_id: Optional[str] = None
    user_id: Optional[str] = None

    def __post_init__(self):
        for key, value in os.environ.items():
            if not key.startswith("BEAKER_"):
                continue
            # Use removeprefix rather than lstrip: lstrip("BEAKER_") strips
            # *characters* from the set {B, E, A, K, R, _}, which would mangle
            # names like BEAKER_REPLICA_RANK into "plica_rank".
            setattr(self, key.removeprefix("BEAKER_").lower(), value)

    @property
    def url(self) -> Optional[str]:
        if self.job_id:
            return f"https://beaker.org/jobs/{self.job_id}"
        return None
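A small sketch of what `BeakerState` does with the environment, using made-up values; it also shows why the prefix must be removed with `removeprefix` rather than `lstrip`:

```python
# Illustrative sketch with made-up environment values; BEAKER_* variables are
# normally set by the Beaker scheduler, not by hand.
import os

os.environ["BEAKER_JOB_ID"] = "01ABC"        # hypothetical value
os.environ["BEAKER_REPLICA_RANK"] = "0"      # hypothetical value

from olmocr.train.core.state import BeakerState

state = BeakerState()
print(state.job_id)        # "01ABC"
print(state.replica_rank)  # "0"  (lstrip would have set "plica_rank" instead)
print(state.url)           # "https://beaker.org/jobs/01ABC"
```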
--------------------------------------------------------------------------------
/olmocr/train/hf/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/train/hf/__init__.py

--------------------------------------------------------------------------------
/olmocr/train/hf/hfhub_upload.py:
--------------------------------------------------------------------------------
1 | import logging 2 | import os 3 | import tarfile 4 | from math import ceil 5 | 6 | from huggingface_hub import HfApi 7 | 8 | # Configuration 9 | pdf_dir = "pdfs" # Directory with PDF files (flat structure) 10 | tarball_dir = "tarballs" # Directory where tar.gz files will be saved 11 | os.makedirs(tarball_dir, exist_ok=True) 12 | repo_id = "allenai/olmOCR-mix-0225" # Hugging Face dataset repo ID 13 | 14 | # Set up logging to file 15 | logging.basicConfig(filename="upload.log", level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") 16 | 17 | 18 | def process_chunk(args): 19 | """ 20 | Worker function to create a tar.gz file for a given chunk. 21 | Returns a tuple: (chunk_index, success (bool), message). 22 | """ 23 | chunk_index, chunk_files = args 24 | tarball_name = f"pdf_chunk_{chunk_index:04d}.tar.gz" 25 | tarball_path = os.path.join(tarball_dir, tarball_name) 26 | 27 | try: 28 | with tarfile.open(tarball_path, "w:gz") as tar: 29 | for pdf_filename in chunk_files: 30 | pdf_path = os.path.join(pdf_dir, pdf_filename) 31 | # Add the file with its basename to maintain a flat structure 32 | tar.add(pdf_path, arcname=pdf_filename) 33 | logging.info(f"Chunk {chunk_index:04d}: Created '{tarball_name}' with {len(chunk_files)} PDFs.") 34 | return chunk_index, True, "Success" 35 | except Exception as e: 36 | error_msg = f"Chunk {chunk_index:04d}: Error creating '{tarball_name}': {e}" 37 | logging.error(error_msg) 38 | return chunk_index, False, error_msg 39 | 40 | 41 | def main(): 42 | # List all PDF files (assuming a flat directory) 43 | try: 44 | pdf_files = sorted([f for f in os.listdir(pdf_dir) if f.lower().endswith(".pdf")]) 45 | except Exception as e: 46 | logging.error(f"Error listing PDFs in '{pdf_dir}': {e}") 47 | return 48 | 49 | total_files = len(pdf_files) 50 | chunk_size = 5000 51 | total_chunks = ceil(total_files / chunk_size) 52 | logging.info(f"Found {total_files} PDFs; dividing into {total_chunks} chunks of up to {chunk_size} files each.") 53 | 54 | # # Enumerate chunks (starting at 0000) 55 | # chunks = [] 56 | # for idx in range(total_chunks): 57 | # start = idx * chunk_size 58 | # end = start + chunk_size 59 | # chunk_files = pdf_files[start:end] 60 | # chunks.append((idx, chunk_files)) 61 | 62 | # # Create tarballs in parallel 63 | # results = [] 64 | # with ProcessPoolExecutor() as executor: 65 | # futures = {executor.submit(process_chunk, chunk): chunk for chunk in chunks} 66 | # for future in tqdm(as_completed(futures), total=len(futures), desc="Creating tarballs"): 67 | # try: 68 | # result = future.result() 69 | # results.append(result) 70 | # chunk_index, success, message = result 71 | # if not success: 72 | # logging.error(f"Chunk {chunk_index:04d} failed: {message}") 73 | # except Exception as e: 74 | # logging.error(f"Unexpected error processing a chunk: {e}") 75 | 76 | # # Abort upload if any tarball creation failed 77 | # failed_chunks = [r for r in results if not r[1]] 78 | # if failed_chunks: 79 | # logging.error(f"{len(failed_chunks)} chunk(s) failed to create.
Aborting upload.") 80 | # return 81 | 82 | # All tarballs created successfully; now upload the entire tarball directory 83 | 84 | api = HfApi() 85 | logging.info("Starting upload of tarballs folder to Hugging Face Hub...") 86 | # This will upload all files in tarball_dir to the repo under "pdf_tarballs" 87 | api.upload_large_folder( 88 | folder_path=tarball_dir, 89 | repo_id=repo_id, 90 | # path_in_repo="pdf_tarballs", 91 | repo_type="dataset", 92 | ) 93 | logging.info("Successfully uploaded tarballs folder to Hugging Face Hub.") 94 | 95 | 96 | if __name__ == "__main__": 97 | main() 98 | -------------------------------------------------------------------------------- /olmocr/train/inference.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from io import BytesIO 3 | 4 | import torch 5 | import torch.distributed 6 | from PIL import Image 7 | from transformers import AutoConfig, AutoProcessor, Qwen2_5_VLForConditionalGeneration 8 | 9 | from olmocr.data.renderpdf import render_pdf_to_base64png 10 | from olmocr.prompts.anchor import get_anchor_text 11 | from olmocr.prompts.prompts import build_openai_silver_data_prompt 12 | 13 | 14 | @torch.no_grad() 15 | def run_inference(model_name: str): 16 | config = AutoConfig.from_pretrained(model_name) 17 | processor = AutoProcessor.from_pretrained(model_name) 18 | 19 | # If it doesn't load, change the type:mrope key to "default" 20 | 21 | # model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, device_map="auto", config=config) 22 | model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_name, device_map="auto", config=config) 23 | model.eval() 24 | 25 | # local_pdf_path = os.path.join(os.path.dirname(__file__), "..", "..", "tests", "gnarly_pdfs", "horribleocr.pdf") 26 | local_pdf_path = "/root/brochure.pdf" 27 | page = 1 28 | 29 | image_base64 = render_pdf_to_base64png(local_pdf_path, page, 1024) 30 | anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport") 31 | 32 | messages = [ 33 | { 34 | "role": "user", 35 | "content": [ 36 | {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)}, 37 | {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}, 38 | ], 39 | } 40 | ] 41 | 42 | # Preparation for inference 43 | text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 44 | 45 | main_image = Image.open(BytesIO(base64.b64decode(image_base64))) 46 | 47 | inputs = processor( 48 | text=[text], 49 | images=[main_image], 50 | padding=True, 51 | return_tensors="pt", 52 | ) 53 | inputs = inputs.to("cuda") 54 | 55 | output_ids = model.generate(**inputs, temperature=0.8, do_sample=True, max_new_tokens=1500) 56 | generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], output_ids)] 57 | output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) 58 | print(output_text[0]) 59 | 60 | 61 | def main(): 62 | run_inference(model_name="Qwen/Qwen2.5-VL-7B-Instruct") 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /olmocr/train/loaddataset.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoProcessor 2 | 3 | from olmocr.train.core.cli import make_cli 4 | from olmocr.train.core.config import TrainConfig 5 | 6 | from .utils import make_dataset 7 | 8 | 9 
| def main(): 10 | train_config = make_cli(TrainConfig) # pyright: ignore 11 | 12 | processor = AutoProcessor.from_pretrained(train_config.model.name_or_path, trust_remote_code=True) 13 | train_dataset, valid_dataset = make_dataset(train_config, processor) 14 | 15 | print("Training dataset........") 16 | print(train_dataset) 17 | 18 | train_example = train_dataset[0] 19 | print(train_example) 20 | print({(x, y.shape) for x, y in train_example.items()}) 21 | print("\nTokens") 22 | print(processor.tokenizer.batch_decode(train_example["input_ids"])) 23 | 24 | print("\n\n") 25 | 26 | print("Validation dataset........") 27 | print(valid_dataset) 28 | print(valid_dataset[list(valid_dataset.keys())[0]][0]) 29 | print("\n\n") 30 | 31 | print("Datasets loaded into hugging face cache directory") 32 | 33 | # data_collator = TruncatingCollator( 34 | # max_length=4096 35 | # ) 36 | 37 | # train_dataloader = DataLoader(train_dataset, batch_size=1, num_workers=4, shuffle=False, collate_fn=data_collator) 38 | # max_seen_len = 0 39 | # for index, entry in tqdm(enumerate(train_dataloader)): 40 | # if index == 0: 41 | # print(entry) 42 | 43 | # num_input_tokens = entry["input_ids"].shape[1] 44 | # max_seen_len = max(max_seen_len, num_input_tokens) 45 | 46 | # print(max_seen_len) 47 | 48 | 49 | if __name__ == "__main__": 50 | main() 51 | -------------------------------------------------------------------------------- /olmocr/train/molmo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/train/molmo/__init__.py -------------------------------------------------------------------------------- /olmocr/train/molmo/config_molmo.py: -------------------------------------------------------------------------------- 1 | from transformers import PretrainedConfig 2 | 3 | 4 | class MolmoConfig(PretrainedConfig): 5 | model_type = "molmo" 6 | keys_to_ignore_at_inference = ["past_key_values"] 7 | 8 | def __init__( 9 | self, 10 | vocab_size=50304, 11 | embedding_size=50304, 12 | hidden_size=4096, 13 | intermediate_size=11008, 14 | num_hidden_layers=32, 15 | num_attention_heads=32, 16 | num_key_value_heads=None, 17 | max_position_embeddings=2048, 18 | initializer_range=0.02, 19 | use_cache=True, 20 | layer_norm_eps: float = 1e-5, 21 | rope_theta=10000.0, 22 | clip_qkv=None, 23 | qkv_bias: bool = False, 24 | weight_tying: bool = False, 25 | use_position_ids: bool = True, 26 | tie_word_embeddings: bool = True, 27 | attention_layer_norm: bool = False, 28 | norm_after: bool = False, 29 | layer_norm_type: str = "rms", 30 | **kwargs, 31 | ): 32 | self.vocab_size = vocab_size 33 | self.embedding_size = embedding_size 34 | self.max_position_embeddings = max_position_embeddings 35 | self.hidden_size = hidden_size 36 | self.intermediate_size = intermediate_size 37 | self.num_hidden_layers = num_hidden_layers 38 | self.num_attention_heads = num_attention_heads 39 | self.layer_norm_eps = layer_norm_eps 40 | self.weight_tying = weight_tying 41 | self.use_position_ids = use_position_ids 42 | self.attention_layer_norm = attention_layer_norm 43 | self.num_key_value_heads = num_key_value_heads 44 | self.initializer_range = initializer_range 45 | self.use_cache = use_cache 46 | self.rope_theta = rope_theta 47 | self.clip_qkv = clip_qkv 48 | self.qkv_bias = qkv_bias 49 | self.norm_after = norm_after 50 | self.tie_word_embeddings = tie_word_embeddings 51 | self.layer_norm_type = layer_norm_type 52 | 53 | 
super().__init__( 54 | tie_word_embeddings=tie_word_embeddings, 55 | **kwargs, 56 | ) 57 | 58 | 59 | MolmoConfig.register_for_auto_class() 60 | -------------------------------------------------------------------------------- /olmocr/version.py: -------------------------------------------------------------------------------- 1 | _MAJOR = "0" 2 | _MINOR = "1" 3 | # On main and in a nightly release the patch should be one ahead of the last 4 | # released build. 5 | _PATCH = "71" 6 | # This is mainly for nightly builds which have the suffix ".dev$DATE". See 7 | # https://semver.org/#is-v123-a-semantic-version for the semantics. 8 | _SUFFIX = "" 9 | 10 | VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR) 11 | VERSION = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX) 12 | -------------------------------------------------------------------------------- /olmocr/viewer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/olmocr/viewer/__init__.py -------------------------------------------------------------------------------- /scripts/beaker/Dockerfile-gpu-ci: -------------------------------------------------------------------------------- 1 | FROM --platform=linux/amd64 nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04 2 | 3 | RUN apt-get update -y && apt-get install -y software-properties-common \ 4 | && add-apt-repository ppa:deadsnakes/ppa \ 5 | && apt-get -y update 6 | 7 | # Install requirements specific to pdfs 8 | RUN apt-get update && apt-get -y install python3-apt 9 | RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections 10 | RUN apt-get update -y && apt-get install -y poppler-utils ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools 11 | 12 | RUN apt-get update -y && apt-get install -y --no-install-recommends \ 13 | git \ 14 | python3.11 \ 15 | python3.11-dev \ 16 | python3.11-distutils \ 17 | ca-certificates \ 18 | build-essential \ 19 | curl \ 20 | unzip 21 | 22 | RUN rm -rf /var/lib/apt/lists/* \ 23 | && unlink /usr/bin/python3 \ 24 | && ln -s /usr/bin/python3.11 /usr/bin/python3 \ 25 | && ln -s /usr/bin/python3 /usr/bin/python \ 26 | && curl -sS https://bootstrap.pypa.io/get-pip.py | python \ 27 | && pip3 install -U pip 28 | 29 | RUN apt-get update && apt-get -y install python3.11-venv 30 | ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh 31 | RUN /install.sh && rm /install.sh 32 | 33 | 34 | WORKDIR /root 35 | COPY gpu-ci-script.sh . 
36 | 37 | ENV PYTHONUNBUFFERED=1 38 | -------------------------------------------------------------------------------- /scripts/beaker/Dockerfile-inference: -------------------------------------------------------------------------------- 1 | FROM --platform=linux/amd64 nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04 2 | 3 | RUN apt-get update -y && apt-get install -y software-properties-common \ 4 | && add-apt-repository ppa:deadsnakes/ppa \ 5 | && apt-get -y update 6 | 7 | # Install requirements specific to pdfs 8 | RUN apt-get update && apt-get -y install python3-apt 9 | RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections 10 | RUN apt-get update -y && apt-get install -y poppler-utils ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools 11 | 12 | RUN apt-get update -y && apt-get install -y --no-install-recommends \ 13 | git \ 14 | python3.11 \ 15 | python3.11-dev \ 16 | python3.11-distutils \ 17 | ca-certificates \ 18 | build-essential \ 19 | curl \ 20 | unzip 21 | 22 | RUN rm -rf /var/lib/apt/lists/* \ 23 | && unlink /usr/bin/python3 \ 24 | && ln -s /usr/bin/python3.11 /usr/bin/python3 \ 25 | && ln -s /usr/bin/python3 /usr/bin/python \ 26 | && curl -sS https://bootstrap.pypa.io/get-pip.py | python \ 27 | && pip3 install -U pip 28 | 29 | RUN apt-get update && apt-get -y install python3.11-venv 30 | ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh 31 | RUN /install.sh && rm /install.sh 32 | 33 | ENV PYTHONUNBUFFERED=1 34 | WORKDIR /root 35 | COPY pyproject.toml pyproject.toml 36 | COPY olmocr/version.py olmocr/version.py 37 | 38 | RUN /root/.local/bin/uv pip install --system --no-cache -e . 39 | 40 | RUN /root/.local/bin/uv pip install --system --no-cache sgl-kernel==0.0.3.post1 --force-reinstall --no-deps 41 | RUN /root/.local/bin/uv pip install --system --no-cache "sglang[all]==0.4.2" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/ 42 | 43 | COPY olmocr olmocr 44 | 45 | WORKDIR /root 46 | COPY olmocr olmocr 47 | 48 | RUN python3 -m sglang.launch_server --help 49 | RUN python3 -m olmocr.pipeline --help -------------------------------------------------------------------------------- /scripts/beaker/Dockerfile-tagging: -------------------------------------------------------------------------------- 1 | FROM --platform=linux/amd64 nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04 2 | 3 | RUN apt-get update -y && apt-get install -y software-properties-common \ 4 | && add-apt-repository ppa:deadsnakes/ppa \ 5 | && apt-get -y update 6 | 7 | # Install requirements specific to pdfs 8 | RUN apt-get update && apt-get -y install python3-apt 9 | RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections 10 | RUN apt-get update -y && apt-get install -y poppler-utils ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools 11 | 12 | RUN apt-get update -y && apt-get install -y --no-install-recommends \ 13 | git \ 14 | python3.11 \ 15 | python3.11-dev \ 16 | python3.11-distutils \ 17 | ca-certificates \ 18 | build-essential \ 19 | curl \ 20 | unzip 21 | 22 | RUN rm -rf /var/lib/apt/lists/* \ 23 | && unlink /usr/bin/python3 \ 24 | && ln -s /usr/bin/python3.11 /usr/bin/python3 \ 25 | && ln -s /usr/bin/python3 /usr/bin/python \ 26 | && curl -sS https://bootstrap.pypa.io/get-pip.py | python \ 27 | && pip3 install -U pip 28 | 29 | RUN apt-get update && apt-get 
-y install python3.11-venv 30 | ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh 31 | RUN /install.sh && rm /install.sh 32 | 33 | ENV PYTHONUNBUFFERED=1 34 | WORKDIR /root 35 | COPY pyproject.toml pyproject.toml 36 | COPY olmocr/version.py olmocr/version.py 37 | 38 | RUN /root/.local/bin/uv pip install --system --no-cache -e . 39 | 40 | RUN /root/.local/bin/uv pip install --system --no-cache vllm==0.8.2 41 | 42 | 43 | WORKDIR /root 44 | COPY olmocr olmocr 45 | COPY scripts scripts 46 | 47 | RUN vllm --help 48 | RUN python3 -m olmocr.pipeline --help 49 | RUN python scripts/tagging_pipeline.py --help

--------------------------------------------------------------------------------
/scripts/beaker/Dockerfile-train:
--------------------------------------------------------------------------------
FROM gcr.io/ai2-beaker-core/public/cqgl31u2ba5vrtuc91og:latest

# Update the package list and install libaio-dev and gnupg2
RUN apt update && apt-get install -y libaio-dev gnupg2

# Add NVIDIA package repository keys
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub \
    && apt-get -y update

# Set up the NVIDIA CUDA repository
RUN apt-get install -y software-properties-common \
    && add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" \
    && apt-get update

# Install CUDA toolkit and nvcc 12.1
RUN apt-get install -y cuda-nvcc-12-1

# Get flash attention setup
RUN pip install flash-attn --no-build-isolation

# Install PDF utilities
RUN apt-get install -y poppler-utils
RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections
RUN apt-get install -y ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools

--------------------------------------------------------------------------------
/scripts/beaker/gpu-ci-script.sh:
--------------------------------------------------------------------------------
#!/usr/bin/bash

set -ex

git clone https://github.com/allenai/olmocr.git olmocr \
    && cd olmocr \
    && git checkout $GIT_REVISION \
    && /root/.local/bin/uv pip install --system --no-cache \
        .[gpu] \
        pytest \
        --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/ \
    && bash scripts/run_integration_test.sh

--------------------------------------------------------------------------------
/scripts/beaker/jupiter-ib.sh:
--------------------------------------------------------------------------------
set -ex
export NCCL_DEBUG=INFO NCCL_SOCKET_IFNAME=ib NCCL_IB_HCA="^=mlx5_bond_0"

--------------------------------------------------------------------------------
/scripts/beaker/pluto-ib.sh:
--------------------------------------------------------------------------------
set -ex
export NCCL_DEBUG=INFO NCCL_SOCKET_IFNAME=ib NCCL_IB_HCA="^=mlx5_1,mlx5_2"

--------------------------------------------------------------------------------
/scripts/birr/config/qwen2-vl-7b-pdf-weka.yaml:
--------------------------------------------------------------------------------
1 | model: 2 | # full fine tune 3 | name_or_path:
weka://oe-data-default/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/best_bf16/ 4 | #name_or_path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/checkpoint-9500/bf16/ 5 | vlm: true 6 | 7 | # necessary to prevent random crashes, until vllm fixes some bugs 8 | num_scheduler_steps: 1 9 | 10 | format: 11 | add_generation_prompt: true 12 | 13 | generate: 14 | # The model's max context length is 8192, but around 1500 tokens are reserved for the image itself 15 | max_context_length: 6500 16 | temperature: 0.8 17 | top_p: 1.0 18 | drop_long_outputs: false 19 | 20 | 21 | pipeline: 22 | sqs_queue_name: jake-pdf 23 | num_workers: 3 24 | generation_batch_size: 256 25 | tokenization_batch_size: 64 26 | output_serializer: default 27 | target_bucket: ai2-oe-data 28 | target_object_prefix: [your username]/pdfworkspaces/s2orc_3200k_v2/inference_outputs 29 | allowed_restarts_per_predictor: 10 30 | 31 | task: 32 | budget: ai2/oe-data 33 | workspace: ai2/oe-data-model-based-cleanup 34 | name: qwen2vl-schedsteps-bg 35 | replicas: 128 36 | priority: LOW 37 | gpu_count: 1 38 | cluster: 39 | - ai2/jupiter-cirrascale-2 40 | - ai2/saturn-cirrascale 41 | 42 | -------------------------------------------------------------------------------- /scripts/build-docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | VERSION=$(python -c 'import olmocr.version; print(olmocr.version.VERSION)') 6 | echo "$VERSION" 7 | 8 | docker build --platform linux/amd64 -f ./scripts/beaker/Dockerfile-inference -t olmocr-inference-$VERSION . 9 | beaker image create --workspace ai2/oe-data-pdf --name olmocr-inference-$VERSION olmocr-inference-$VERSION 10 | 11 | docker build --platform linux/amd64 -f ./scripts/beaker/Dockerfile-tagging -t olmocr-tagging-$VERSION .
12 | beaker image create --workspace ai2/oe-data-pdf --name olmocr-tagging-$VERSION olmocr-tagging-$VERSION -------------------------------------------------------------------------------- /scripts/check_qual.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | python scripts/pii_rule_comparison.py \ 6 | --docs-folder /home/ubuntu/s2pdf_dedupe_minhash_v1_with_no_pii/documents \ 7 | --ref-rule "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.5 and \ 8 | fineweb_edu_fasttext_gt2__fineweb_edu_fasttext_gt2__score:avg>0.001 and \ 9 | avg_fraction_numbers_in_line_v1__avg_fraction_numbers_in_line_v1__avg_fraction_numbers_in_line_ratio:avg<0.2 and \ 10 | pipe_delimited_lines_v1__pipe_delimited_lines_v1__pipe_delimited_lines_ratio:avg<0.3 \ 11 | " \ 12 | --hyp-rule "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.5 and \ 13 | fineweb_edu_fasttext_gt2__fineweb_edu_fasttext_gt2__score:avg>0.001 and \ 14 | avg_fraction_numbers_in_line_v1__avg_fraction_numbers_in_line_v1__avg_fraction_numbers_in_line_ratio:avg<0.2 and \ 15 | pipe_delimited_lines_v1__pipe_delimited_lines_v1__pipe_delimited_lines_ratio:avg<0.4 \ 16 | " \ 17 | --output-dir results/pii_detection 18 | 19 | 20 | # Run 1: langid, pipes, and numbers 21 | # Prompt, boilerplate, reference, prose, table classification -> train fasttext 22 | # 50k docs to train fasttext 23 | 24 | tinyhost results/pii_detection/* -------------------------------------------------------------------------------- /scripts/elo/README.md: -------------------------------------------------------------------------------- 1 | # Elo rating 2 | 3 | Calculates Elo ratings of olmOCR vs. other tools. 4 | 5 | ## Data 6 | 7 | The pairwise judgment data is stored in `ratings.csv` as win/loss counts: 8 | ``` 9 | MethodA,MethodB,A_wins,B_wins,A_rate(%),B_rate(%) 10 | marker,mineru,53,26,67.1,32.9 11 | mineru,pdelf,22,55,28.6,71.4 12 | gotocr_format,marker,26,45,36.6,63.4 13 | marker,pdelf,31,49,38.8,61.3 14 | gotocr_format,pdelf,29,41,41.4,58.6 15 | gotocr_format,mineru,38,37,50.7,49.3 16 | ``` 17 | 18 | *Note:* `pdelf` is olmOCR. 19 | 20 | ## Usage 21 | 22 | To calculate Elo ratings, run the following command: 23 | ```bash 24 | python calculate_elo_ratings.py ratings.csv --num-bootstrap 5000 --num-elo-sims 100 --confidence-level 95 --seed 123 25 | ``` 26 | 27 | It should print something like: 28 | ``` 29 | Bootstrapped Elo Ratings (95% CI): 30 | -------------------------------------------------- 31 | pdelf 1813.0 ± 84.9 [1605.9, 1930.0] 32 | mineru 1545.2 ± 99.7 [1336.7, 1714.1] 33 | marker 1429.1 ± 100.7 [1267.6, 1645.5] 34 | gotocr_format 1212.7 ± 82.0 [1097.3, 1408.3] 35 | 36 | Pairwise Significance Tests: 37 | -------------------------------------------------- 38 | gotocr_format vs marker Δ = -216.3 [-470.8, 135.0] p = 0.218 39 | gotocr_format vs mineru Δ = -332.5 [-567.5, 19.3] p = 0.051 40 | gotocr_format vs pdelf Δ = -600.3 [-826.1, -344.3] p = 0.000* 41 | marker vs mineru Δ = -116.1 [-365.4, 246.5] p = 0.430 42 | marker vs pdelf Δ = -383.9 [-610.6, -10.9] p = 0.044* 43 | mineru vs pdelf Δ = -267.8 [-517.3, 104.0] p = 0.135 44 | ``` 45 | 46 | which is also already saved in `results.txt`. 47 | 48 | To generate boxplots of Elo ratings, run the following command: 49 | ```bash 50 | python draw_boxplots.py results.txt boxplots.png 51 | ``` 52 | 53 | which should save boxplots as `boxplots.png`.
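54 | 
55 | ## How the ratings are computed
56 | 
57 | For intuition, here is a minimal sketch of the core Elo update (plain Python, standard library only). It expands the win/loss counts in `ratings.csv` into individual matches, shuffles them, and applies sequential rating updates. The `K` factor and 1500 starting rating are conventional values assumed for illustration, not necessarily what `calculate_elo_ratings.py` uses, and the bootstrap and significance-test machinery is omitted:
58 | 
59 | ```python
60 | import csv
61 | import random
62 | 
63 | K = 32          # assumed update step size (K-factor); the real script may differ
64 | BASE = 1500.0   # assumed starting rating
65 | 
66 | def expected_score(r_a, r_b):
67 |     # Probability that A beats B under the standard logistic Elo model.
68 |     return 1.0 / (1.0 + 10 ** ((r_b - r_a) / 400.0))
69 | 
70 | def elo_from_counts(path, seed=0):
71 |     # Expand aggregate win/loss counts into individual matches.
72 |     matches = []
73 |     with open(path) as f:
74 |         for row in csv.DictReader(f):
75 |             matches += [(row["MethodA"], row["MethodB"], 1.0)] * int(row["A_wins"])
76 |             matches += [(row["MethodA"], row["MethodB"], 0.0)] * int(row["B_wins"])
77 |     random.Random(seed).shuffle(matches)
78 | 
79 |     ratings = {}
80 |     for a, b, score_a in matches:
81 |         r_a, r_b = ratings.setdefault(a, BASE), ratings.setdefault(b, BASE)
82 |         e_a = expected_score(r_a, r_b)
83 |         # The winner gains and the loser drops, in proportion to how surprising the result was.
84 |         ratings[a] = r_a + K * (score_a - e_a)
85 |         ratings[b] = r_b + K * ((1.0 - score_a) - (1.0 - e_a))
86 |     return ratings
87 | 
88 | for name, rating in sorted(elo_from_counts("ratings.csv").items(), key=lambda kv: -kv[1]):
89 |     print(f"{name:15s}{rating:7.1f}")
90 | ```
91 | 
92 | Because sequential Elo updates are order-dependent, a single pass like this only gives a point estimate; this is presumably why `calculate_elo_ratings.py` exposes `--num-elo-sims` (averaging over shuffled match orderings) and `--num-bootstrap` (resampling matches, which yields the confidence intervals shown above).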
-------------------------------------------------------------------------------- /scripts/elo/boxplots.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/scripts/elo/boxplots.png -------------------------------------------------------------------------------- /scripts/elo/ratings.csv: -------------------------------------------------------------------------------- 1 | MethodA,MethodB,A_wins,B_wins,A_rate(%),B_rate(%) 2 | marker,mineru,53,26,67.1,32.9 3 | mineru,pdelf,22,55,28.6,71.4 4 | gotocr_format,marker,26,45,36.6,63.4 5 | marker,pdelf,31,49,38.8,61.3 6 | gotocr_format,pdelf,29,41,41.4,58.6 7 | gotocr_format,mineru,38,37,50.7,49.3 -------------------------------------------------------------------------------- /scripts/elo/results.txt: -------------------------------------------------------------------------------- 1 | Bootstrapped Elo Ratings (95% CI): 2 | -------------------------------------------------- 3 | pdelf 1813.0 ± 84.9 [1605.9, 1930.0] 4 | mineru 1545.2 ± 99.7 [1336.7, 1714.1] 5 | marker 1429.1 ± 100.7 [1267.6, 1645.5] 6 | gotocr_format 1212.7 ± 82.0 [1097.3, 1408.3] 7 | 8 | Pairwise Significance Tests: 9 | -------------------------------------------------- 10 | gotocr_format vs marker Δ = -216.3 [-470.8, 135.0] p = 0.218 11 | gotocr_format vs mineru Δ = -332.5 [-567.5, 19.3] p = 0.051 12 | gotocr_format vs pdelf Δ = -600.3 [-826.1, -344.3] p = 0.000* 13 | marker vs mineru Δ = -116.1 [-365.4, 246.5] p = 0.430 14 | marker vs pdelf Δ = -383.9 [-610.6, -10.9] p = 0.044* 15 | mineru vs pdelf Δ = -267.8 [-517.3, 104.0] p = 0.135 16 | 17 | -------------------------------------------------------------------------------- /scripts/jsonl_to_markdown.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | 5 | 6 | # This is a simple script to convert JSONL files to Markdown format. 7 | # It reads each line of the JSONL file, extracts the 'text' field, 8 | # and saves it as a Markdown file with the line number as the filename. 9 | # The script also handles potential JSON decoding errors and prints relevant messages. 10 | def jsonl_to_markdown(input_file, output_dir): 11 | """ 12 | Reads a JSONL file, extracts the 'text' field from each line, and saves it as a Markdown file. 13 | 14 | Args: 15 | input_file (str): Path to the input JSONL file. 16 | output_dir (str): Directory to save the Markdown files. 
17 | """ 18 | if not os.path.exists(output_dir): 19 | os.makedirs(output_dir) 20 | 21 | with open(input_file, "r", encoding="utf-8") as file: 22 | for i, line in enumerate(file): 23 | try: 24 | # Parse the JSON line 25 | data = json.loads(line) 26 | text_content = data.get("text", "") 27 | 28 | # Save to a Markdown file 29 | output_file = os.path.join(output_dir, f"line_{i + 1}.md") 30 | with open(output_file, "w", encoding="utf-8") as md_file: 31 | md_file.write(text_content) 32 | 33 | print(f"Extracted and saved line {i + 1} to {output_file}") 34 | except json.JSONDecodeError as e: 35 | print(f"Error decoding JSON on line {i + 1}: {e}") 36 | except Exception as e: 37 | print(f"Unexpected error on line {i + 1}: {e}") 38 | 39 | 40 | # Example usage 41 | # input_jsonl_file = "/path/to/test.jsonl" # Replace with the actual path to your JSONL file 42 | # output_directory = "/path/to/output_markdown" # Replace with the desired output directory 43 | # jsonl_to_markdown(input_jsonl_file, output_directory) 44 | 45 | # This is the main entrypoint to use the script from the command line. 46 | # It takes two arguments: the input JSONL file and the output directory. 47 | # The script will create the output directory if it does not exist. 48 | if __name__ == "__main__": 49 | if len(sys.argv) != 3: 50 | print("Usage: python jsonl_to_markdown.py ") 51 | sys.exit(1) 52 | 53 | input_file = sys.argv[1] 54 | output_dir = sys.argv[2] 55 | 56 | jsonl_to_markdown(input_file, output_dir) 57 | -------------------------------------------------------------------------------- /scripts/molmo-7b-lora-gantry.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -ex 4 | 5 | # check if jq is installed 6 | if ! command -v jq &> /dev/null 7 | then 8 | echo "jq could not be found. Please install it." 
9 | exit 10 | fi 11 | 12 | 13 | EXTRA_ARGS="-c olmocr/train/config/molmo-o-lora-8192.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/molmo-pdf/v1/models/\${BEAKER_USER_ID}\"" 14 | 15 | run_name=$(basename "$0" .sh) 16 | 17 | # --cluster 'ai2/jupiter*' \ 18 | # --cluster 'ai2/pluto*' \ 19 | # --cluster 'ai2/allennlp-cirrascale' \ 20 | # --priority high \ 21 | 22 | CLUSTER='jupiter' 23 | 24 | gantry run \ 25 | --description "${run_name}-8192"\ 26 | --task-name "${run_name}-8192"\ 27 | --allow-dirty \ 28 | --host-networking \ 29 | --workspace ai2/oe-data-model-based-cleanup \ 30 | --beaker-image 'jakep/jakep-pdf-finetunev1.2' \ 31 | --venv 'base' \ 32 | --pip gantry-requirements.txt \ 33 | --priority high \ 34 | --gpus 8 \ 35 | --cluster "ai2/${CLUSTER}*" \ 36 | --budget ai2/oe-data \ 37 | --weka "oe-data-default:/data" \ 38 | --env LOG_FILTER_TYPE=local_rank0_only \ 39 | --env OMP_NUM_THREADS=8 \ 40 | --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \ 41 | --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \ 42 | --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \ 43 | --env-secret DS_AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \ 44 | --env-secret DS_AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \ 45 | --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \ 46 | --shared-memory 10GiB \ 47 | --yes \ 48 | -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m olmocr.train.loaddataset ${EXTRA_ARGS} && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}" -------------------------------------------------------------------------------- /scripts/parse_with_pdfminer.py: -------------------------------------------------------------------------------- 1 | from pdfminer.high_level import extract_pages 2 | from pdfminer.layout import LTChar 3 | 4 | 5 | def extract_chars_with_transforms(pdf_path, page_num=0): 6 | """ 7 | Extract characters with transformation data for a specific page in a PDF. 8 | 9 | Args: 10 | pdf_path (str): Path to the PDF file 11 | page_num (int): Page number to extract (0-indexed) 12 | """ 13 | print(f"Analyzing PDF: {pdf_path}, Page: {page_num + 1}") 14 | char_count = 0 15 | 16 | # Extract only the specified page 17 | for i, page_layout in enumerate(extract_pages(pdf_path)): 18 | if i == page_num: 19 | print(f"Processing page {page_num + 1}") 20 | 21 | # Recursively process all elements 22 | def process_element(element, level=0): 23 | nonlocal char_count 24 | indent = " " * level 25 | 26 | if isinstance(element, LTChar): 27 | char = element.get_text() 28 | matrix = element.matrix 29 | font = element.fontname if hasattr(element, "fontname") else "Unknown" 30 | size = element.size if hasattr(element, "size") else "Unknown" 31 | 32 | print(f"{indent}Character: '{char}'") 33 | print(f"{indent}Transform Matrix: {matrix}") 34 | print(f"{indent}Font: {font}, Size: {size}") 35 | print(f"{indent}{'-' * 30}") 36 | char_count += 1 37 | 38 | # For container elements, process their children 39 | if hasattr(element, "_objs"): 40 | for obj in element._objs: 41 | process_element(obj, level + 1) 42 | 43 | # Process all elements in the page 44 | for element in page_layout: 45 | process_element(element) 46 | 47 | break # Stop after processing the requested page 48 | 49 | print(f"\nTotal characters extracted: {char_count}") 50 | 51 | if char_count == 0: 52 | print("No characters were extracted. This could mean:") 53 | print(f"1. 
Page {page_num + 1} doesn't exist or is empty") 54 | print("2. The PDF contains scanned images rather than text") 55 | print("3. The text is embedded in a way PDFMiner can't extract") 56 | 57 | 58 | # Usage 59 | 60 | pdf_path = "/Users/kylel/Downloads/olmOCR_Technical_Report_COLM_2025.pdf" 61 | extract_chars_with_transforms(pdf_path) 62 | -------------------------------------------------------------------------------- /scripts/prepare_changelog.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from pathlib import Path 3 | 4 | from olmocr.version import VERSION 5 | 6 | 7 | def main(): 8 | changelog = Path("CHANGELOG.md") 9 | 10 | with changelog.open() as f: 11 | lines = f.readlines() 12 | 13 | insert_index: int = -1 14 | for i in range(len(lines)): 15 | line = lines[i] 16 | if line.startswith("## Unreleased"): 17 | insert_index = i + 1 18 | elif line.startswith(f"## [v{VERSION}]"): 19 | print("CHANGELOG already up-to-date") 20 | return 21 | elif line.startswith("## [v"): 22 | break 23 | 24 | if insert_index < 0: 25 | raise RuntimeError("Couldn't find 'Unreleased' section") 26 | 27 | lines.insert(insert_index, "\n") 28 | lines.insert( 29 | insert_index + 1, 30 | f"## [v{VERSION}](https://github.com/allenai/olmocr/releases/tag/v{VERSION}) - " f"{datetime.now().strftime('%Y-%m-%d')}\n", 31 | ) 32 | 33 | with changelog.open("w") as f: 34 | f.writelines(lines) 35 | 36 | 37 | if __name__ == "__main__": 38 | main() 39 | -------------------------------------------------------------------------------- /scripts/qwen25vl-7b-gantry.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -ex 4 | 5 | # check if jq is installed 6 | if ! command -v jq &> /dev/null 7 | then 8 | echo "jq could not be found. Please install it." 
9 | exit 10 | fi 11 | 12 | 13 | EXTRA_ARGS="-c olmocr/train/config/qwen25vl-7b.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen25vl-pdf/v1/models/\${BEAKER_USER_ID}\"" 14 | 15 | run_name=$(basename "$0" .sh) 16 | 17 | # --cluster 'ai2/jupiter*' \ 18 | # --cluster 'ai2/pluto*' \ 19 | # --cluster 'ai2/allennlp-cirrascale' \ 20 | # --priority high \ 21 | 22 | CLUSTER='jupiter' 23 | 24 | gantry run \ 25 | --description "${run_name}"\ 26 | --task-name "${run_name}"\ 27 | --allow-dirty \ 28 | --host-networking \ 29 | --workspace ai2/oe-data-model-based-cleanup \ 30 | --beaker-image 'jakep/jakep-pdf-finetunev1.2' \ 31 | --venv 'base' \ 32 | --pip gantry-requirements.txt \ 33 | --priority high \ 34 | --gpus 8 \ 35 | --preemptible \ 36 | --cluster "ai2/${CLUSTER}*" \ 37 | --budget ai2/oe-data \ 38 | --weka "oe-data-default:/data" \ 39 | --env LOG_FILTER_TYPE=local_rank0_only \ 40 | --env OMP_NUM_THREADS=8 \ 41 | --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \ 42 | --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \ 43 | --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \ 44 | --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \ 45 | --shared-memory 10GiB \ 46 | --yes \ 47 | -- /bin/bash -c "pip install transformers==4.51.3 && source scripts/beaker/${CLUSTER}-ib.sh && python -m olmocr.train.loaddataset ${EXTRA_ARGS} && accelerate launch --use_fsdp --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --fsdp_offload_params false --fsdp_sharding_strategy FULL_SHARD --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}" -------------------------------------------------------------------------------- /scripts/qwen2vl-2b-gantry.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -ex 4 | 5 | # check if jq is installed 6 | if ! command -v jq &> /dev/null 7 | then 8 | echo "jq could not be found. Please install it." 
9 | exit 10 | fi 11 | 12 | 13 | EXTRA_ARGS="-c olmocr/train/config/qwen2vl-2b.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\"" 14 | 15 | run_name=$(basename "$0" .sh) 16 | 17 | # --cluster 'ai2/jupiter*' \ 18 | # --cluster 'ai2/pluto*' \ 19 | # --cluster 'ai2/allennlp-cirrascale' \ 20 | # --priority high \ 21 | 22 | CLUSTER='jupiter' 23 | 24 | gantry run \ 25 | --description "${run_name}"\ 26 | --task-name "${run_name}"\ 27 | --allow-dirty \ 28 | --host-networking \ 29 | --workspace ai2/oe-data-pdf \ 30 | --beaker-image 'jakep/jakep-pdf-finetunev1.2' \ 31 | --venv 'base' \ 32 | --pip gantry-requirements.txt \ 33 | --priority normal \ 34 | --gpus 8 \ 35 | --preemptible \ 36 | --cluster "ai2/${CLUSTER}*" \ 37 | --budget ai2/oe-data \ 38 | --env LOG_FILTER_TYPE=local_rank0_only \ 39 | --env OMP_NUM_THREADS=8 \ 40 | --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \ 41 | --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \ 42 | --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \ 43 | --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \ 44 | --shared-memory 10GiB \ 45 | --yes \ 46 | -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}" -------------------------------------------------------------------------------- /scripts/qwen2vl-7b-gantry.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -ex 4 | 5 | # check if jq is installed 6 | if ! command -v jq &> /dev/null 7 | then 8 | echo "jq could not be found. Please install it." 9 | exit 10 | fi 11 | 12 | 13 | EXTRA_ARGS="-c olmocr/train/config/qwen2vl-7b.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\"" 14 | 15 | run_name=$(basename "$0" .sh) 16 | 17 | # --cluster 'ai2/jupiter*' \ 18 | # --cluster 'ai2/pluto*' \ 19 | # --cluster 'ai2/allennlp-cirrascale' \ 20 | # --priority high \ 21 | 22 | CLUSTER='jupiter' 23 | 24 | gantry run \ 25 | --description "${run_name}"\ 26 | --task-name "${run_name}"\ 27 | --allow-dirty \ 28 | --host-networking \ 29 | --workspace ai2/oe-data-model-based-cleanup \ 30 | --beaker-image 'jakep/jakep-pdf-finetunev1.2' \ 31 | --venv 'base' \ 32 | --pip gantry-requirements.txt \ 33 | --priority high \ 34 | --gpus 8 \ 35 | --preemptible \ 36 | --cluster "ai2/${CLUSTER}*" \ 37 | --budget ai2/oe-data \ 38 | --weka "oe-data-default:/data" \ 39 | --env LOG_FILTER_TYPE=local_rank0_only \ 40 | --env OMP_NUM_THREADS=8 \ 41 | --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \ 42 | --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \ 43 | --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \ 44 | --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \ 45 | --shared-memory 10GiB \ 46 | --yes \ 47 | -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m olmocr.train.loaddataset ${EXTRA_ARGS} && accelerate launch --use_fsdp --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --fsdp_offload_params false --fsdp_sharding_strategy FULL_SHARD --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}" -------------------------------------------------------------------------------- /scripts/qwen2vl-7b-lora-gantry.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -ex 4 | 5 | # check if jq is installed 6 | if ! command -v jq &> /dev/null 7 | then 8 | echo "jq could not be found. Please install it." 9 | exit 10 | fi 11 | 12 | 13 | EXTRA_ARGS="-c olmocr/train/config/qwen2vl-7b-lora.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\"" 14 | 15 | run_name=$(basename "$0" .sh) 16 | 17 | # --cluster 'ai2/jupiter*' \ 18 | # --cluster 'ai2/pluto*' \ 19 | # --cluster 'ai2/allennlp-cirrascale' \ 20 | # --priority high \ 21 | 22 | CLUSTER='jupiter' 23 | 24 | gantry run \ 25 | --description "${run_name}"\ 26 | --task-name "${run_name}"\ 27 | --allow-dirty \ 28 | --host-networking \ 29 | --workspace ai2/oe-data-model-based-cleanup \ 30 | --beaker-image 'jakep/jakep-pdf-finetunev1.2' \ 31 | --venv 'base' \ 32 | --pip gantry-requirements.txt \ 33 | --priority high \ 34 | --gpus 8 \ 35 | --preemptible \ 36 | --cluster "ai2/${CLUSTER}*" \ 37 | --budget ai2/oe-data \ 38 | --weka "oe-data-default:/data" \ 39 | --env LOG_FILTER_TYPE=local_rank0_only \ 40 | --env OMP_NUM_THREADS=8 \ 41 | --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \ 42 | --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \ 43 | --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \ 44 | --env-secret DS_AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \ 45 | --env-secret DS_AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \ 46 | --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \ 47 | --shared-memory 10GiB \ 48 | --yes \ 49 | -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m olmocr.train.loaddataset ${EXTRA_ARGS} && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}" -------------------------------------------------------------------------------- /scripts/release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | # Function to extract version components from version.py using regex 6 | get_version_from_file() { 7 | VERSION_FILE="olmocr/version.py" 8 | 9 | if [[ ! -f "$VERSION_FILE" ]]; then 10 | echo "Error: $VERSION_FILE does not exist." 11 | exit 1 12 | fi 13 | 14 | # Extract _MAJOR 15 | _MAJOR=$(grep -E '^_MAJOR\s*=\s*"([^"]+)"' "$VERSION_FILE" | sed -E 's/_MAJOR\s*=\s*"([^"]+)"/\1/') 16 | if [[ -z "$_MAJOR" ]]; then 17 | echo "Error: Could not extract _MAJOR from $VERSION_FILE." 18 | exit 1 19 | fi 20 | 21 | # Extract _MINOR 22 | _MINOR=$(grep -E '^_MINOR\s*=\s*"([^"]+)"' "$VERSION_FILE" | sed -E 's/_MINOR\s*=\s*"([^"]+)"/\1/') 23 | if [[ -z "$_MINOR" ]]; then 24 | echo "Error: Could not extract _MINOR from $VERSION_FILE." 25 | exit 1 26 | fi 27 | 28 | # Extract _PATCH 29 | _PATCH=$(grep -E '^_PATCH\s*=\s*"([^"]+)"' "$VERSION_FILE" | sed -E 's/_PATCH\s*=\s*"([^"]+)"/\1/') 30 | if [[ -z "$_PATCH" ]]; then 31 | echo "Error: Could not extract _PATCH from $VERSION_FILE." 
32 | exit 1 33 | fi 34 | 35 | # Extract _SUFFIX (optional) 36 | _SUFFIX=$(grep -E '^_SUFFIX\s*=\s*"([^"]*)"' "$VERSION_FILE" | sed -E 's/_SUFFIX\s*=\s*"([^"]*)"/\1/') 37 | if [[ -z "$_SUFFIX" ]]; then 38 | _SUFFIX="" 39 | fi 40 | 41 | # Construct VERSION 42 | VERSION_PY="${_MAJOR}.${_MINOR}.${_PATCH}${_SUFFIX}" 43 | echo "$VERSION_PY" 44 | } 45 | 46 | TAG=$(python -c 'from olmocr.version import VERSION; print("v" + VERSION)') 47 | 48 | # Get the VERSION from version.py 49 | VERSION_PY=$(get_version_from_file) 50 | 51 | # Compare the two versions 52 | if [[ "v$VERSION_PY" != "$TAG" ]]; then 53 | echo "Version mismatch detected:" 54 | echo " Python reported version: $TAG" 55 | echo " version.py contains: v$VERSION_PY" 56 | echo 57 | read -p "The versions do not match. Please run 'pip install -e .' to synchronize versions. Do you want to continue? [Y/n] " prompt 58 | 59 | if [[ ! "$prompt" =~ ^([yY][eE][sS]|[yY])$ ]]; then 60 | echo "Release process aborted due to version mismatch." 61 | exit 1 62 | else 63 | echo "Proceeding with the release despite the version mismatch." 64 | fi 65 | fi 66 | 67 | read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt 68 | 69 | if [[ $prompt == "y" || $prompt == "Y" || $prompt == "yes" || $prompt == "Yes" ]]; then 70 | python scripts/prepare_changelog.py 71 | git add -A 72 | git commit -m "Bump version to $TAG for release" || true && git push 73 | echo "Creating new git tag $TAG" 74 | git tag "$TAG" -m "$TAG" 75 | git push --tags 76 | else 77 | echo "Cancelled" 78 | exit 1 79 | fi -------------------------------------------------------------------------------- /scripts/release_notes.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | """ 4 | Prepares markdown release notes for GitHub releases. 5 | """ 6 | 7 | import os 8 | from typing import List, Optional 9 | 10 | import packaging.version 11 | 12 | TAG = os.environ["TAG"] 13 | 14 | ADDED_HEADER = "### Added 🎉" 15 | CHANGED_HEADER = "### Changed ⚠️" 16 | FIXED_HEADER = "### Fixed ✅" 17 | REMOVED_HEADER = "### Removed 👋" 18 | 19 | 20 | def get_change_log_notes() -> str: 21 | in_current_section = False 22 | current_section_notes: List[str] = [] 23 | with open("CHANGELOG.md") as changelog: 24 | for line in changelog: 25 | if line.startswith("## "): 26 | if line.startswith("## Unreleased"): 27 | continue 28 | if line.startswith(f"## [{TAG}]"): 29 | in_current_section = True 30 | continue 31 | break 32 | if in_current_section: 33 | if line.startswith("### Added"): 34 | line = ADDED_HEADER + "\n" 35 | elif line.startswith("### Changed"): 36 | line = CHANGED_HEADER + "\n" 37 | elif line.startswith("### Fixed"): 38 | line = FIXED_HEADER + "\n" 39 | elif line.startswith("### Removed"): 40 | line = REMOVED_HEADER + "\n" 41 | current_section_notes.append(line) 42 | assert current_section_notes 43 | return "## What's new\n\n" + "".join(current_section_notes).strip() + "\n" 44 | 45 | 46 | def get_commit_history() -> str: 47 | new_version = packaging.version.parse(TAG) 48 | 49 | # Pull all tags. 50 | os.popen("git fetch --tags") 51 | 52 | # Get all tags sorted by version, latest first. 53 | all_tags = os.popen("git tag -l --sort=-version:refname 'v*'").read().split("\n") 54 | 55 | # Out of `all_tags`, find the latest previous version so that we can collect all 56 | # commits between that version and the new version we're about to publish. 57 | # Note that we ignore pre-releases unless the new version is also a pre-release. 
58 | last_tag: Optional[str] = None 59 | for tag in all_tags: 60 | if not tag.strip(): # could be blank line 61 | continue 62 | version = packaging.version.parse(tag) 63 | if new_version.pre is None and version.pre is not None: 64 | continue 65 | if version < new_version: 66 | last_tag = tag 67 | break 68 | if last_tag is not None: 69 | commits = os.popen(f"git log {last_tag}..{TAG} --oneline --first-parent").read() 70 | else: 71 | commits = os.popen("git log --oneline --first-parent").read() 72 | return "## Commits\n\n" + commits 73 | 74 | 75 | def main(): 76 | print(get_change_log_notes()) 77 | print(get_commit_history()) 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /scripts/run_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | # Use conda environment Python if available, otherwise use system Python 6 | if [ -n "$CONDA_PREFIX" ]; then 7 | PYTHON="$CONDA_PREFIX/bin/python" 8 | echo "Using conda Python from: $CONDA_PREFIX" 9 | else 10 | PYTHON="python" 11 | echo "Warning: No conda environment detected, using system Python" 12 | fi 13 | 14 | # Get version from version.py 15 | VERSION=$($PYTHON -c 'import olmocr.version; print(olmocr.version.VERSION)') 16 | echo "OlmOCR version: $VERSION" 17 | 18 | # Get first 10 characters of git hash 19 | GIT_HASH=$(git rev-parse HEAD | cut -c1-10) 20 | echo "Git hash: $GIT_HASH" 21 | 22 | # Get current git branch name 23 | GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD) 24 | echo "Git branch: $GIT_BRANCH" 25 | 26 | # Create full image tag 27 | IMAGE_TAG="olmocr-benchmark-${VERSION}-${GIT_HASH}" 28 | echo "Building Docker image with tag: $IMAGE_TAG" 29 | 30 | # Build the Docker image 31 | echo "Building Docker image..." 32 | docker build --platform linux/amd64 -f ./Dockerfile -t $IMAGE_TAG . 33 | 34 | # Get Beaker username 35 | BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name') 36 | echo "Beaker user: $BEAKER_USER" 37 | 38 | # Push image to beaker 39 | echo "Pushing image to Beaker..." 
40 | beaker image create --workspace ai2/oe-data-pdf --name $IMAGE_TAG $IMAGE_TAG 41 | 42 | # Create Python script to run beaker experiment 43 | cat << 'EOF' > /tmp/run_benchmark_experiment.py 44 | import sys 45 | from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints 46 | 47 | # Get image tag, beaker user, git branch, and git hash from command line 48 | image_tag = sys.argv[1] 49 | beaker_user = sys.argv[2] 50 | git_branch = sys.argv[3] 51 | git_hash = sys.argv[4] 52 | 53 | # Initialize Beaker client 54 | b = Beaker.from_env(default_workspace="ai2/olmocr") 55 | 56 | # Create experiment spec 57 | experiment_spec = ExperimentSpec( 58 | description=f"OlmOCR Benchmark Run - Branch: {git_branch}, Commit: {git_hash}", 59 | budget="ai2/oe-data", 60 | tasks=[ 61 | TaskSpec( 62 | name="olmocr-benchmark", 63 | image=ImageSource(beaker=f"{beaker_user}/{image_tag}"), 64 | command=[ 65 | "bash", "-c", 66 | " && ".join([ 67 | "git clone https://huggingface.co/datasets/allenai/olmOCR-bench", 68 | "cd olmOCR-bench && git lfs pull && cd ..", 69 | "python -m olmocr.pipeline ./localworkspace --markdown --pdfs ./olmOCR-bench/bench_data/pdfs/**/*.pdf", 70 | "python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/olmocr --bench-path ./olmOCR-bench/", 71 | "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data" 72 | ]) 73 | ], 74 | context=TaskContext( 75 | priority=Priority.normal, 76 | preemptible=True, 77 | ), 78 | resources=TaskResources(gpu_count=1), 79 | constraints=Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]), 80 | result=ResultSpec(path="/noop-results"), 81 | ) 82 | ], 83 | ) 84 | 85 | # Create the experiment 86 | experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/olmocr") 87 | print(f"Created experiment: {experiment.id}") 88 | print(f"View at: https://beaker.org/ex/{experiment.id}") 89 | EOF 90 | 91 | # Run the Python script to create the experiment 92 | echo "Creating Beaker experiment..." 93 | $PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH 94 | 95 | # Clean up temporary file 96 | rm /tmp/run_benchmark_experiment.py 97 | 98 | echo "Benchmark experiment submitted successfully!" 
-------------------------------------------------------------------------------- /scripts/run_integration_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | set -ex 4 | 5 | python -m olmocr.pipeline ./localworkspace --pdfs tests/gnarly_pdfs/ambiguous.pdf tests/gnarly_pdfs/edgar.pdf tests/gnarly_pdfs/dolma-page-1.pdf \ 6 | && pytest tests/test_integration.py 7 | -------------------------------------------------------------------------------- /scripts/run_tagging_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | gantry run --gpus 1 --workspace ai2/olmocr --beaker-image ai2/pytorch2.5.1-cuda12.1-python3.11 --cluster ai2/jupiter-cirrascale-2 --budget ai2/oe-data --priority normal --env-secret AWS_CREDENTIALS_FILE=jakep-AWS_CREDENTIALS_FILE --env-secret HF_TOKEN=jake-HF_TOKEN --allow-dirty -- /bin/bash -c "pip install -e .[gpu] --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/ && pip install --upgrade sglang==0.4.5.post3 transformers==4.51.3 && python scripts/tagging_pipeline.py s3://ai2-oe-data/jakep/s2pdf_dedupe_minhash_v1_mini s3://ai2-oe-data/jakep/s2pdf_dedupe_minhash_v1_mini_scratch" 6 | 7 | gantry run --gpus 1 --workspace ai2/olmocr --beaker-image ai2/pytorch2.5.1-cuda12.1-python3.11 --cluster ai2/jupiter-cirrascale-2 --budget ai2/oe-data --priority normal --env-secret AWS_CREDENTIALS_FILE=jakep-AWS_CREDENTIALS_FILE --env-secret HF_TOKEN=jake-HF_TOKEN --allow-dirty -- /bin/bash -c "pip install -e .[gpu,bench] --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/ && huggingface-cli download allenai/olmOCR-bench --repo-type dataset --local-dir ./olmOCR-bench && olmocr/bench/scripts/convert_all.sh" -------------------------------------------------------------------------------- /scripts/s2orc_extractor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define the output file for the metadata.sha1 fields 4 | OUTPUT_FILE="s2orc_pdfs_v2.txt" 5 | 6 | # Clear the output file if it already exists 7 | > "$OUTPUT_FILE" 8 | 9 | # Create a temporary directory for partial outputs 10 | temp_output_dir=$(mktemp -d) 11 | 12 | # Ensure the temporary directory is cleaned up on exit or error 13 | trap 'rm -rf "$temp_output_dir"' EXIT 14 | 15 | # Export the temporary output directory variable for use in xargs 16 | export temp_output_dir 17 | 18 | echo "temp dir $temp_output_dir" 19 | 20 | # Find all .gz files recursively from the current directory 21 | find 'split=train' -type f -name "*.gz" | \ 22 | xargs -P 30 -I{} bash -c ' 23 | gz_file="$1" 24 | partial_output="$temp_output_dir/$(basename "$gz_file").txt" 25 | 26 | # Stream uncompressed data directly into jq and format the output 27 | gunzip -c "$gz_file" | jq -r '"'"' 28 | select(.metadata.sha1 != null) | 29 | "s3://ai2-s2-pdfs/" + (.metadata.sha1[:4]) + "/" + (.metadata.sha1[4:]) + ".pdf" 30 | '"'"' >> "$partial_output" 31 | ' _ {} 32 | 33 | # Concatenate all partial outputs into the final output file 34 | cat "$temp_output_dir"/*.txt >> "$OUTPUT_FILE" 35 | 36 | echo "All metadata.sha1 fields have been extracted to $OUTPUT_FILE."
37 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/__init__.py -------------------------------------------------------------------------------- /tests/gnarly_pdfs/ambiguous.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/ambiguous.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/badlines.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/badlines.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/bws_book_ch2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/bws_book_ch2.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/discoverworld_crazy_tables.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/discoverworld_crazy_tables.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/dolma-page-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/dolma-page-1.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/edgar.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/edgar.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/failing_anchor_pg4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/failing_anchor_pg4.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/failing_pdf_pg9.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/failing_pdf_pg9.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/form_on_later_pages.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/form_on_later_pages.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/guidebook_failed_pages.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/guidebook_failed_pages.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/handwriting_bad_ocr.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/handwriting_bad_ocr.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/horribleocr.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/horribleocr.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/instructions_and_schematics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/instructions_and_schematics.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/large_prompt_hint1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/large_prompt_hint1.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/large_prompt_hint2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/large_prompt_hint2.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/large_prompt_hint3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/large_prompt_hint3.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/load_v_error.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/load_v_error.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/lots_of_chem_tables.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/lots_of_chem_tables.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/lots_of_sci_tables.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/lots_of_sci_tables.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/map1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/map1.pdf -------------------------------------------------------------------------------- 
/tests/gnarly_pdfs/most_content_in_image_form.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/most_content_in_image_form.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/newspaper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/newspaper.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/not_parsing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/not_parsing.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/not_parsing2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/not_parsing2.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/olmo-page-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/olmo-page-1.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/overrun_on_pg8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/overrun_on_pg8.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/pdftotext_two_column_issue.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/pdftotext_two_column_issue.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/repeating_references_on_pg9_pg10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/repeating_references_on_pg9_pg10.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/skinnypage.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/skinnypage.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/slideshow_mostly_good_some_pages_should_get_filtered.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/slideshow_mostly_good_some_pages_should_get_filtered.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/slideshow_mostly_images.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/slideshow_mostly_images.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/small_page_size.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/small_page_size.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/some_ocr1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/some_ocr1.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/ti89_guidebook_programming.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/ti89_guidebook_programming.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/tobacco_missed_tokens_pg1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/gnarly_pdfs/tobacco_missed_tokens_pg1.pdf -------------------------------------------------------------------------------- /tests/test_dataloader.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from functools import partial 3 | 4 | import pytest 5 | from torch.utils.data import DataLoader 6 | from tqdm import tqdm 7 | from transformers import AutoProcessor 8 | 9 | from olmocr.train.dataloader import ( 10 | build_finetuning_dataset, 11 | extract_openai_batch_response, 12 | list_dataset_files, 13 | load_jsonl_into_ds, 14 | ) 15 | from olmocr.train.dataprep import batch_prepare_data_for_qwen2_training 16 | 17 | 18 | @pytest.mark.nonci 19 | class TestBatchQueryResponseDataset(unittest.TestCase): 20 | def testLoadS3(self): 21 | ds = load_jsonl_into_ds("s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl", first_n_files=3) 22 | 23 | print(f"Loaded {len(ds)} entries") 24 | print(ds) 25 | print(ds["train"]) 26 | 27 | def testFinetuningDS(self): 28 | ds = build_finetuning_dataset( 29 | response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json", 30 | ) 31 | 32 | print(ds) 33 | 34 | processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct") 35 | 36 | ds = ds.with_transform(partial(batch_prepare_data_for_qwen2_training, processor=processor, target_longest_image_dim=1024, target_anchor_text_len=6000)) 37 | 38 | print(ds[0]) 39 | 40 | def testPlotSequenceLengthHistogram(self): 41 | import plotly.express as px 42 | 43 | ds = build_finetuning_dataset( 44 | response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json", 45 | ) 46 | 47 | processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct") 48 | 49 | ds = ds.with_transform(partial(batch_prepare_data_for_qwen2_training, processor=processor, target_longest_image_dim=1024, target_anchor_text_len=6000)) 50 | 51 | processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct") 52 | 53 | initial_len = len(ds) 54 | 55 | train_dataloader = DataLoader(ds, batch_size=1, num_workers=30, shuffle=False) 56 | 
57 | max_seen_len = 0 58 | steps = 0 59 | sequence_lengths = [] # List to store sequence lengths 60 | for entry in tqdm(train_dataloader): 61 | num_input_tokens = entry["input_ids"].shape[1] 62 | max_seen_len = max(max_seen_len, num_input_tokens) 63 | sequence_lengths.append(num_input_tokens) # Collecting sequence lengths 64 | 65 | if steps % 100 == 0: 66 | print(f"Max input len {max_seen_len}") 67 | 68 | steps += 1 69 | 70 | # model.forward(**{k: v.to("cuda:0") for (k,v) in entry.items()}) 71 | print(f"Max input len {max_seen_len}") 72 | print(f"Total elements before filtering: {initial_len}") 73 | print(f"Total elements after filtering: {steps}") 74 | 75 | # Plotting the histogram using Plotly 76 | fig = px.histogram( 77 | sequence_lengths, nbins=100, title="Distribution of Input Sequence Lengths", labels={"value": "Sequence Length", "count": "Frequency"} 78 | ) 79 | 80 | fig.write_image("sequence_lengths_histogram.png") 81 | -------------------------------------------------------------------------------- /tests/test_filter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from pypdf import PdfReader 5 | 6 | from olmocr.filter import PdfFilter 7 | 8 | 9 | class PdfFilterTest(unittest.TestCase): 10 | def testFormLaterPages(self): 11 | self.filter = PdfFilter(apply_form_check=True) 12 | 13 | self.assertTrue(self.filter.filter_out_pdf(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "form_on_later_pages.pdf"))) 14 | 15 | self.filter = PdfFilter(apply_form_check=False) 16 | 17 | self.assertFalse(self.filter.filter_out_pdf(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "form_on_later_pages.pdf"))) 18 | -------------------------------------------------------------------------------- /tests/test_integration.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import os 4 | import unittest 5 | 6 | import pytest 7 | 8 | 9 | @pytest.mark.nonci 10 | class TestPipelineIntegration(unittest.TestCase): 11 | def setUp(self): 12 | self.data = [] 13 | 14 | for file in glob.glob(os.path.join("localworkspace", "results", "*.jsonl")): 15 | with open(file, "r") as jf: 16 | for line in jf: 17 | if len(line.strip()) > 0: 18 | self.data.append(json.loads(line)) 19 | print(self.data[-1]) 20 | 21 | def test_edgar(self) -> None: 22 | self.assertTrue(any("King of the English" in line["text"] for line in self.data)) 23 | 24 | def test_ambig(self) -> None: 25 | self.assertTrue(any("Apples and Bananas" in line["text"] for line in self.data)) 26 | 27 | def test_dolma(self) -> None: 28 | self.assertTrue(any("We extensively document Dolma" in line["text"] for line in self.data)) 29 | -------------------------------------------------------------------------------- /tests/test_molmo.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | import requests 5 | from PIL import Image 6 | from transformers import ( 7 | AutoModelForCausalLM, 8 | AutoProcessor, 9 | AutoTokenizer, 10 | GenerationConfig, 11 | ) 12 | 13 | 14 | @pytest.mark.nonci 15 | class MolmoProcessorTest(unittest.TestCase): 16 | def test_molmo_demo(self): 17 | # load the processor 18 | processor = AutoProcessor.from_pretrained( 19 | "allenai/Molmo-7B-O-0924", 20 | trust_remote_code=True, 21 | torch_dtype="auto", 22 | ) 23 | 24 | # load the model 25 | model = AutoModelForCausalLM.from_pretrained( 26 | "allenai/Molmo-7B-O-0924", 27 | 
trust_remote_code=True, 28 | torch_dtype="auto", 29 | ) 30 | 31 | device = "cuda:0" 32 | 33 | model = model.to(device) 34 | 35 | # process the image and text 36 | inputs = processor.process(images=[Image.open(requests.get("https://picsum.photos/id/237/536/354", stream=True).raw)], text="Describe this image.") 37 | 38 | # move inputs to the correct device and make a batch of size 1 39 | inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()} 40 | 41 | print("Raw inputs") 42 | print(inputs) 43 | 44 | print("\nShapes") 45 | # {('input_ids', torch.Size([1, 589])), ('images', torch.Size([1, 5, 576, 588])), ('image_masks', torch.Size([1, 5, 576])), ('image_input_idx', torch.Size([1, 5, 144]))} 46 | print({(x, y.shape) for x, y in inputs.items()}) 47 | 48 | print("\nTokens") 49 | print(processor.tokenizer.batch_decode(inputs["input_ids"])) 50 | 51 | # generate output; maximum 200 new tokens; stop generation when <|endoftext|> is generated 52 | output = model.generate_from_batch(inputs, GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"), tokenizer=processor.tokenizer) 53 | 54 | # only get generated tokens; decode them to text 55 | generated_tokens = output[0, inputs["input_ids"].size(1) :] 56 | generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True) 57 | 58 | # print the generated text 59 | print(generated_text) 60 | -------------------------------------------------------------------------------- /tests/test_renders/output_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/test_renders/output_image.png -------------------------------------------------------------------------------- /tests/test_renders/output_image_rotated90.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/2235b82c8e16a3b2f63c046e9a8aa5b362287a40/tests/test_renders/output_image_rotated90.png --------------------------------------------------------------------------------