├── .dockerignore ├── .github ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── documentation.yml │ └── feature_request.yml ├── actions │ └── setup-venv │ │ └── action.yml ├── dependabot.yml ├── pull_request_template.md └── workflows │ ├── main.yml │ └── pr_checks.yml ├── .gitignore ├── .readthedocs.yaml ├── CHANGELOG.md ├── Dockerfile ├── Dockerfile.with-model ├── LICENSE ├── Makefile ├── README.md ├── RELEASE_PROCESS.md ├── docs ├── .gitignore ├── Makefile ├── make.bat └── source │ ├── CHANGELOG.md │ ├── CONTRIBUTING.md │ ├── _static │ ├── css │ │ └── custom.css │ └── favicon.ico │ ├── conf.py │ ├── index.md │ ├── installation.md │ ├── ocr_pareto.pdf │ ├── ocr_pareto.png │ └── overview.md ├── olmOCR-2-Unit-Test-Rewards-for-Document-OCR.pdf ├── olmocr ├── __init__.py ├── bench │ ├── README.md │ ├── __init__.py │ ├── benchmark.py │ ├── convert.py │ ├── katex │ │ ├── __init__.py │ │ ├── auto-render.min.js │ │ ├── katex.min.css │ │ ├── katex.min.js │ │ └── render.py │ ├── miners │ │ ├── check_headers_footers.py │ │ ├── check_multicolumn.py │ │ ├── check_old_scans_math.py │ │ ├── cleanup_data.py │ │ ├── cleanup_urls.py │ │ ├── delete_rejected.py │ │ ├── download_math.py │ │ ├── mine_blank_pages_gpt.py │ │ ├── mine_diffs.py │ │ ├── mine_headers_footers.py │ │ ├── mine_long_tiny_text.py │ │ ├── mine_math.py │ │ ├── mine_multi_column.py │ │ ├── mine_multilingual_gpt.py │ │ ├── mine_old_scan_pdf.py │ │ ├── mine_old_scans.py │ │ ├── mine_old_scans_math.py │ │ ├── mine_reading_order.py │ │ ├── mine_tables_gemini.py │ │ ├── mine_tables_gpt.py │ │ ├── mine_tables_gpt_simple.py │ │ └── pick_mediod.py │ ├── prompts.py │ ├── report.py │ ├── review_app.py │ ├── review_app_latex.py │ ├── runners │ │ ├── __init__.py │ │ ├── run_chatgpt.py │ │ ├── run_claude.py │ │ ├── run_docling.py │ │ ├── run_gemini.py │ │ ├── run_gotocr.py │ │ ├── run_marker.py │ │ ├── run_mineru.py │ │ ├── run_mistral.py │ │ ├── run_nanonetsocr.py │ │ ├── run_nanonetsocr_2.py │ │ ├── run_olmocr_pipeline.py │ │ ├── run_paddlepaddle.py │ │ ├── run_paddlevl.py │ │ ├── run_rolmocr.py │ │ ├── run_server.py │ │ └── run_transformers.py │ ├── sample_data │ │ ├── blanks.jsonl │ │ ├── dataset.jsonl │ │ ├── olmocr_pipeline │ │ │ ├── blank_book_pg1_pg1_repeat1.md │ │ │ ├── buildingnotes_pg1_repeat1.md │ │ │ ├── discoverworld_crazy_table4_pg1_repeat1.md │ │ │ ├── earnings_pg1_repeat1.md │ │ │ ├── headers_footers │ │ │ │ ├── ff0f0b22c55d8b90dd77d153f48e144fc9db_pg2_pg1_repeat1.md │ │ │ │ ├── ff1fc6a205ad039139ce566851b6b260c929_pg1_pg1_repeat1.md │ │ │ │ ├── ff3d6e051903fe5ca9bc172ece14964c5632_pg1_pg1_repeat1.md │ │ │ │ ├── ff4f7dad78081cff727d19ab51c181d4a661_pg1_pg1_repeat1.md │ │ │ │ ├── ff518b1240a66978f22035528ccb029450b5_pg2_pg1_repeat1.md │ │ │ │ ├── ffaac214730d2b8c2ec842e3618ccb9c4259_pg1_pg1_repeat1.md │ │ │ │ └── fff590bed29a2854ac1f874dad5752ede1aa_pg1_pg1_repeat1.md │ │ │ ├── lincoln_letter_pg1_repeat1.md │ │ │ ├── math_2503_04086_pg1_repeat1.md │ │ │ ├── mathfuncs_colswitch_pg1_repeat1.md │ │ │ ├── mathfuncs_pg1_repeat1.md │ │ │ ├── mattsnotes_pg1_repeat1.md │ │ │ ├── mattsnotes_pg2_repeat1.md │ │ │ ├── mattsnotes_pg3_repeat1.md │ │ │ ├── multi_column_miss_pg1_repeat1.md │ │ │ ├── olmo2-pg4_pg1_repeat1.md │ │ │ ├── openstax_caculus_pg_273_pg1_repeat1.md │ │ │ ├── small_page_size_pg1_repeat1.md │ │ │ └── test-graphical-text_pg1_repeat1.md │ │ └── pdfs │ │ │ ├── blank_book_pg1.pdf │ │ │ ├── buildingnotes.pdf │ │ │ ├── discoverworld_crazy_table4.pdf │ │ │ ├── earnings.pdf │ │ │ ├── headers_footers │ │ │ ├── ff0f0b22c55d8b90dd77d153f48e144fc9db_pg2.pdf │ │ │ ├── ff1fc6a205ad039139ce566851b6b260c929_pg1.pdf │ │ │ ├── ff3d6e051903fe5ca9bc172ece14964c5632_pg1.pdf │ │ │ ├── ff4f7dad78081cff727d19ab51c181d4a661_pg1.pdf │ │ │ ├── ff518b1240a66978f22035528ccb029450b5_pg2.pdf │ │ │ ├── ffaac214730d2b8c2ec842e3618ccb9c4259_pg1.pdf │ │ │ └── fff590bed29a2854ac1f874dad5752ede1aa_pg1.pdf │ │ │ ├── lincoln_letter.pdf │ │ │ ├── math_2503_04086.pdf │ │ │ ├── mathfuncs.pdf │ │ │ ├── mathfuncs_colswitch.pdf │ │ │ ├── mattsnotes.pdf │ │ │ ├── multi_column_miss.pdf │ │ │ ├── olmo2-pg4.pdf │ │ │ ├── openstax_caculus_pg_273.pdf │ │ │ ├── small_page_size.pdf │ │ │ └── test-graphical-text.pdf │ ├── scripts │ │ ├── convert_all.sh │ │ ├── difference_viewer.py │ │ ├── rotate_pdfs.py │ │ ├── rotate_pdfs_random.sh │ │ ├── run_difference.py │ │ ├── url_matcher.py │ │ └── workspace_to_bench.py │ ├── synth │ │ ├── __init__.py │ │ ├── mine_html_templates.py │ │ └── rotate_html_templates.py │ ├── templates │ │ ├── all_done.html │ │ ├── all_done_latex.html │ │ ├── review.html │ │ └── review_latex.html │ ├── tests.py │ └── utils.py ├── check.py ├── data │ ├── __init__.py │ ├── build_openai_batch_from_olmocrmix.py │ ├── buildsilver.py │ ├── clean_olmocrmix.py │ ├── prepare_loc_transcripts.py │ ├── prepare_national_archive_transcripts.py │ ├── prepare_olmocrmix.py │ ├── prepare_workspace.py │ ├── process_openai_batch_results.py │ ├── renderpdf.py │ ├── repackage_olmocrmix.py │ └── runopenaibatch.py ├── datatypes.py ├── filter │ ├── __init__.py │ ├── coherency.py │ └── filter.py ├── image_utils.py ├── metrics.py ├── pipeline.py ├── prompts │ ├── __init__.py │ ├── anchor.py │ └── prompts.py ├── py.typed ├── repeatdetect.py ├── s3_utils.py ├── train │ ├── README.md │ ├── compare_vllm_checkpoint.py │ ├── compress_checkpoint.py │ ├── config.py │ ├── configs │ │ ├── v0.2.0 │ │ │ ├── qwen25_vl_b100_x1_day2_capped.yaml │ │ │ ├── qwen25_vl_b100_x1_day3_json.yaml │ │ │ ├── qwen25_vl_b100_x1_day3_json_1280.yaml │ │ │ ├── qwen25_vl_b100_x1_day3_json_1280_noanchor.yaml │ │ │ ├── qwen25_vl_b100_x1_day3_json_1600.yaml │ │ │ ├── qwen25_vl_b100_x1_day3_json_lr5e-5.yaml │ │ │ ├── qwen25_vl_b100_x1_day3_json_wsd.yaml │ │ │ ├── qwen25_vl_b100_x1_day3_yaml_1280_noanchor.yaml │ │ │ ├── qwen25_vl_b100_x1_day3_yaml_1280_noanchor_128batch.yaml │ │ │ ├── qwen25_vl_b100_x1_day3_yaml_1280_noanchor_latexnormalize.yaml │ │ │ ├── qwen25_vl_b100_x1_day3_yaml_1280_noanchor_newprompt.yaml │ │ │ ├── qwen25_vl_b100_x1_default.yaml │ │ │ ├── qwen25_vl_b100_x1_default_fresh_prompt.yaml │ │ │ ├── qwen25_vl_b100_x1_default_fresh_prompt_no_doc_anchoring.yaml │ │ │ ├── qwen25_vl_b100_x1_default_grad_acc32.yaml │ │ │ ├── qwen25_vl_b100_x1_default_image_1280.yaml │ │ │ ├── qwen25_vl_b100_x1_default_image_1280_no_doc_anchor.yaml │ │ │ ├── qwen25_vl_b100_x1_default_image_1600.yaml │ │ │ ├── qwen25_vl_b100_x1_default_image_1600_no_doc_anchor.yaml │ │ │ ├── qwen25_vl_b100_x1_default_json.yaml │ │ │ ├── qwen25_vl_b100_x1_default_no_doc_anchor.yaml │ │ │ ├── qwen25_vl_olmocrv2_1288_soupfull0.yaml │ │ │ ├── qwen25_vl_olmocrv2_1288_soupfull1.yaml │ │ │ ├── qwen25_vl_olmocrv2_1288_soupfull2.yaml │ │ │ ├── qwen25_vl_olmocrv2_1epoch.yaml │ │ │ ├── qwen25_vl_olmocrv2_1epoch_muon2e-5.yaml │ │ │ ├── qwen25_vl_olmocrv2_1epoch_muon2e-6.yaml │ │ │ ├── qwen25_vl_olmocrv2_1epoch_muon6e-6.yaml │ │ │ ├── qwen25_vl_olmocrv2_2epoch.yaml │ │ │ ├── qwen25_vl_olmocrv2_2epoch_promptfirst.yaml │ │ │ ├── qwen25_vl_olmocrv2_soup0.yaml │ │ │ ├── qwen25_vl_olmocrv2_soup1.yaml │ │ │ ├── qwen25_vl_olmocrv2_soup2.yaml │ │ │ ├── qwen25_vl_olmocrv2_tokflip.yaml │ │ │ ├── qwen25_vl_olmocrv2_tokflip1k.yaml │ │ │ ├── qwen25_vl_olmocrv2_tokflip3k.yaml │ │ │ ├── qwen25_vl_olmocrv2_tokflip500.yaml │ │ │ ├── qwen25_vl_olmocrv2_tokflip_2ep.yaml │ │ │ ├── qwen2_vl_b100_x1_day3_json.yaml │ │ │ └── qwen2_vl_b100_x1_day3_yaml.yaml │ │ ├── v0.3.0 │ │ │ ├── qwen25_vl_olmocrv3_1epoch.yaml │ │ │ ├── qwen25_vl_olmocrv3_rotation_1epoch.yaml │ │ │ ├── qwen25_vl_olmocrv3_rotation_2epoch.yaml │ │ │ ├── qwen25_vl_olmocrv3_rotation_2epoch_resumable.yaml │ │ │ ├── qwen25_vl_olmocrv3_rotation_augment_1epoch.yaml │ │ │ ├── qwen25_vl_olmocrv3_rotation_augment_1epoch_resumable.yaml │ │ │ ├── qwen25_vl_olmocrv3_rotation_augment_2epoch_resumable.yaml │ │ │ ├── qwen25_vl_olmocrv3_rotation_localtest.yaml │ │ │ └── qwen25_vl_olmocrv3_rotation_tokflip_1epoch.yaml │ │ └── v0.4.0 │ │ │ ├── qwen25_vl_olmocrv4_finetuning.yaml │ │ │ ├── qwen25_vl_olmocrv4_rotation_1epoch.yaml │ │ │ ├── qwen25_vl_olmocrv4_rotation_1epoch_filtering.yaml │ │ │ ├── qwen25_vl_olmocrv4_rotation_1epoch_filtering_mix_0925_more_rotation.yaml │ │ │ ├── qwen25_vl_olmocrv4_rotation_1epoch_filtering_mix_0925_preempt.yaml │ │ │ ├── qwen25_vl_olmocrv4_rotation_1epoch_filtering_noise.yaml │ │ │ ├── qwen25_vl_olmocrv4_rotation_1epoch_filtering_preempt.yaml │ │ │ ├── qwen25_vl_olmocrv4_rotation_1epoch_localtest.yaml │ │ │ ├── qwen25_vl_olmocrv4_rotation_1epoch_mix_0925_more_rotation.yaml │ │ │ ├── qwen25_vl_olmocrv4_rotation_1epoch_mix_1025.yaml │ │ │ ├── qwen25_vl_olmocrv4_rotation_1epoch_mix_1025_filtered.yaml │ │ │ └── qwen25_vl_olmocrv4_rotation_2epoch.yaml │ ├── dataloader.py │ ├── grpo_train.py │ ├── muon.py │ ├── prepare_checkpoint.py │ ├── quantization_configs │ │ ├── qwen2_5vl_w8a16_fp8.yaml │ │ ├── qwen2_5vl_w8a16_group.yaml │ │ ├── qwen2_5vl_w8a8_fp8.yaml │ │ ├── qwen2_5vl_w8a8_fp8_alt.yaml │ │ ├── qwen2_5vl_w8a8_fp8_block.yaml │ │ ├── qwen2_5vl_w8a8_fp8_kv8.yaml │ │ ├── qwen2_5vl_w8a8_int8.yaml │ │ └── qwen2vl_w8a8_fp8.yaml │ └── train.py ├── version.py ├── viewer │ ├── __init__.py │ ├── dolmaviewer.py │ ├── dolmaviewer_merged_template.html │ └── dolmaviewer_template.html └── work_queue.py ├── pyproject.toml ├── scripts ├── beaker │ ├── Dockerfile-gpu-ci │ ├── gpu-ci-script.sh │ ├── jupiter-ib.sh │ └── pluto-ib.sh ├── check_contamination.py ├── compare_vllm.sh ├── compress_model.sh ├── data │ ├── __init__.py │ ├── buildsilver.py │ ├── buildsilverdatasummary.py │ ├── buildtestset.py │ ├── convertsilver_birr.py │ ├── convertsilver_openai.py │ └── runopenaibatch.py ├── elo │ ├── README.md │ ├── boxplots.png │ ├── calculate_elo_ratings.py │ ├── draw_boxplots.py │ ├── ratings.csv │ └── results.txt ├── eval │ ├── __init__.py │ ├── buildelo.py │ ├── dolma_refine │ │ ├── aligners.py │ │ ├── metrics.py │ │ ├── registry.py │ │ └── segmenters.py │ ├── evalhtml.py │ ├── evalhtml_template.html │ ├── runeval.py │ └── scoreelo.py ├── hf_local_test.py ├── infinigram_count.py ├── jsonl_to_markdown.py ├── movedolmadocs_to_md.py ├── parse_with_pdfminer.py ├── pii │ ├── autoscan_dolmadocs.py │ ├── chatgpt_tag_dolmadocs_v1.py │ ├── chatgpt_tag_dolmadocs_v2.py │ ├── check_qual.sh │ ├── pii_rule_comparison.py │ ├── rich_tagging_pipeline.py │ ├── run_tagging_pipeline.sh │ ├── tagging_pipeline.py │ └── tagging_pipeline_v2.py ├── plots │ ├── ocr_pareto.pdf │ ├── ocr_pareto.png │ ├── olmocr2_timeline.png │ ├── pareto_plot.py │ └── plot_olmocr2_timeline.py ├── prepare_changelog.py ├── release.sh ├── release_notes.py ├── remove_paths_from_olmocrmix.py ├── run_benchmark.sh ├── run_benchmark_guided_decoding.sh ├── run_infrapartner_benchmark.sh ├── run_integration_test.sh ├── run_marker_benchmark.sh ├── run_mineru_benchmark.sh ├── run_paddle_benchmark.sh ├── run_paddlevl_benchmark.sh ├── run_transformers_benchmark.sh ├── s2orc_extractor.sh ├── scan_dolmadocs.py ├── sync_beaker_image.sh └── train │ ├── grpotrainer-beaker-multi-gpu-augusta.sh │ ├── grpotrainer-beaker-multi-gpu.sh │ ├── grpotrainer-beaker.sh │ ├── newtrainer-beaker.sh │ └── newtrainer-frontier.sh └── tests ├── __init__.py ├── gnarly_pdfs ├── ambiguous.pdf ├── badlines.pdf ├── bws_book_ch2.pdf ├── discoverworld_crazy_tables.pdf ├── dolma-page-1.pdf ├── edgar-rotated90.pdf ├── edgar.pdf ├── edgar_image.pdf ├── failing_anchor_pg4.pdf ├── failing_pdf_pg9.pdf ├── form_on_later_pages.pdf ├── guidebook_failed_pages.pdf ├── handwriting_bad_ocr.pdf ├── horribleocr.pdf ├── instructions_and_schematics.pdf ├── large_prompt_hint1.pdf ├── large_prompt_hint2.pdf ├── large_prompt_hint3.pdf ├── load_v_error.pdf ├── lots_of_chem_tables.pdf ├── lots_of_sci_tables.pdf ├── map1.pdf ├── most_content_in_image_form.pdf ├── newspaper.pdf ├── not_parsing.pdf ├── not_parsing2.pdf ├── olmo-page-1.pdf ├── overrun_on_pg8.pdf ├── pdftotext_two_column_issue.pdf ├── repeating_references_on_pg9_pg10.pdf ├── skinnypage.pdf ├── slideshow_mostly_good_some_pages_should_get_filtered.pdf ├── slideshow_mostly_images.pdf ├── small_page_size.pdf ├── some_ocr1.pdf ├── ti89_guidebook_programming.pdf └── tobacco_missed_tokens_pg1.pdf ├── sample_dataset ├── empty_document │ ├── blanktext.md │ └── blanktext.pdf ├── simple_document │ ├── edgar.md │ └── edgar.pdf └── urls.jsonl ├── test_anchor.py ├── test_dataloader.py ├── test_filter.py ├── test_grpo.py ├── test_integration.py ├── test_katex_render.py ├── test_mine_html_templates.py ├── test_olmocrmix.py ├── test_pipeline.py ├── test_s3_work_queue.py └── test_tests.py /.dockerignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/.dockerignore -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/.github/CONTRIBUTING.md -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/.github/ISSUE_TEMPLATE/bug_report.yml -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/.github/ISSUE_TEMPLATE/documentation.yml -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/.github/ISSUE_TEMPLATE/feature_request.yml -------------------------------------------------------------------------------- /.github/actions/setup-venv/action.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/.github/actions/setup-venv/action.yml -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/.github/dependabot.yml -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/.github/pull_request_template.md -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/.github/workflows/main.yml -------------------------------------------------------------------------------- /.github/workflows/pr_checks.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/.github/workflows/pr_checks.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/.gitignore -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/.readthedocs.yaml -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/CHANGELOG.md -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/Dockerfile -------------------------------------------------------------------------------- /Dockerfile.with-model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/Dockerfile.with-model -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/LICENSE -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/Makefile -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/README.md -------------------------------------------------------------------------------- /RELEASE_PROCESS.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/RELEASE_PROCESS.md -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/docs/Makefile -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/docs/make.bat -------------------------------------------------------------------------------- /docs/source/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ../../CHANGELOG.md -------------------------------------------------------------------------------- /docs/source/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ../../.github/CONTRIBUTING.md -------------------------------------------------------------------------------- /docs/source/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/source/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/docs/source/_static/favicon.ico -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/docs/source/conf.py -------------------------------------------------------------------------------- /docs/source/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/docs/source/index.md -------------------------------------------------------------------------------- /docs/source/installation.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/docs/source/installation.md -------------------------------------------------------------------------------- /docs/source/ocr_pareto.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/docs/source/ocr_pareto.pdf -------------------------------------------------------------------------------- /docs/source/ocr_pareto.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/docs/source/ocr_pareto.png -------------------------------------------------------------------------------- /docs/source/overview.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/docs/source/overview.md -------------------------------------------------------------------------------- /olmOCR-2-Unit-Test-Rewards-for-Document-OCR.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmOCR-2-Unit-Test-Rewards-for-Document-OCR.pdf -------------------------------------------------------------------------------- /olmocr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/__init__.py -------------------------------------------------------------------------------- /olmocr/bench/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/README.md -------------------------------------------------------------------------------- /olmocr/bench/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /olmocr/bench/benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/benchmark.py -------------------------------------------------------------------------------- /olmocr/bench/convert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/convert.py -------------------------------------------------------------------------------- /olmocr/bench/katex/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/katex/__init__.py -------------------------------------------------------------------------------- /olmocr/bench/katex/auto-render.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/katex/auto-render.min.js -------------------------------------------------------------------------------- /olmocr/bench/katex/katex.min.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/katex/katex.min.css -------------------------------------------------------------------------------- /olmocr/bench/katex/katex.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/katex/katex.min.js -------------------------------------------------------------------------------- /olmocr/bench/katex/render.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/katex/render.py -------------------------------------------------------------------------------- /olmocr/bench/miners/check_headers_footers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/check_headers_footers.py -------------------------------------------------------------------------------- /olmocr/bench/miners/check_multicolumn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/check_multicolumn.py -------------------------------------------------------------------------------- /olmocr/bench/miners/check_old_scans_math.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/check_old_scans_math.py -------------------------------------------------------------------------------- /olmocr/bench/miners/cleanup_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/cleanup_data.py -------------------------------------------------------------------------------- /olmocr/bench/miners/cleanup_urls.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/cleanup_urls.py -------------------------------------------------------------------------------- /olmocr/bench/miners/delete_rejected.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/delete_rejected.py -------------------------------------------------------------------------------- /olmocr/bench/miners/download_math.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/download_math.py -------------------------------------------------------------------------------- /olmocr/bench/miners/mine_blank_pages_gpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/mine_blank_pages_gpt.py -------------------------------------------------------------------------------- /olmocr/bench/miners/mine_diffs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/mine_diffs.py -------------------------------------------------------------------------------- /olmocr/bench/miners/mine_headers_footers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/mine_headers_footers.py -------------------------------------------------------------------------------- /olmocr/bench/miners/mine_long_tiny_text.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/mine_long_tiny_text.py -------------------------------------------------------------------------------- /olmocr/bench/miners/mine_math.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/mine_math.py -------------------------------------------------------------------------------- /olmocr/bench/miners/mine_multi_column.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/mine_multi_column.py -------------------------------------------------------------------------------- /olmocr/bench/miners/mine_multilingual_gpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/mine_multilingual_gpt.py -------------------------------------------------------------------------------- /olmocr/bench/miners/mine_old_scan_pdf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/mine_old_scan_pdf.py -------------------------------------------------------------------------------- /olmocr/bench/miners/mine_old_scans.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/mine_old_scans.py -------------------------------------------------------------------------------- /olmocr/bench/miners/mine_old_scans_math.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/mine_old_scans_math.py -------------------------------------------------------------------------------- /olmocr/bench/miners/mine_reading_order.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/mine_reading_order.py -------------------------------------------------------------------------------- /olmocr/bench/miners/mine_tables_gemini.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/mine_tables_gemini.py -------------------------------------------------------------------------------- /olmocr/bench/miners/mine_tables_gpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/mine_tables_gpt.py -------------------------------------------------------------------------------- /olmocr/bench/miners/mine_tables_gpt_simple.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/mine_tables_gpt_simple.py -------------------------------------------------------------------------------- /olmocr/bench/miners/pick_mediod.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/miners/pick_mediod.py -------------------------------------------------------------------------------- /olmocr/bench/prompts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/prompts.py -------------------------------------------------------------------------------- /olmocr/bench/report.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/report.py -------------------------------------------------------------------------------- /olmocr/bench/review_app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/review_app.py -------------------------------------------------------------------------------- /olmocr/bench/review_app_latex.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/review_app_latex.py -------------------------------------------------------------------------------- /olmocr/bench/runners/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /olmocr/bench/runners/run_chatgpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/runners/run_chatgpt.py -------------------------------------------------------------------------------- /olmocr/bench/runners/run_claude.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/runners/run_claude.py -------------------------------------------------------------------------------- /olmocr/bench/runners/run_docling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/runners/run_docling.py -------------------------------------------------------------------------------- /olmocr/bench/runners/run_gemini.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/runners/run_gemini.py -------------------------------------------------------------------------------- /olmocr/bench/runners/run_gotocr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/runners/run_gotocr.py -------------------------------------------------------------------------------- /olmocr/bench/runners/run_marker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/runners/run_marker.py -------------------------------------------------------------------------------- /olmocr/bench/runners/run_mineru.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/runners/run_mineru.py -------------------------------------------------------------------------------- /olmocr/bench/runners/run_mistral.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/runners/run_mistral.py -------------------------------------------------------------------------------- /olmocr/bench/runners/run_nanonetsocr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/runners/run_nanonetsocr.py -------------------------------------------------------------------------------- /olmocr/bench/runners/run_nanonetsocr_2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/runners/run_nanonetsocr_2.py -------------------------------------------------------------------------------- /olmocr/bench/runners/run_olmocr_pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/runners/run_olmocr_pipeline.py -------------------------------------------------------------------------------- /olmocr/bench/runners/run_paddlepaddle.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/runners/run_paddlepaddle.py -------------------------------------------------------------------------------- /olmocr/bench/runners/run_paddlevl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/runners/run_paddlevl.py -------------------------------------------------------------------------------- /olmocr/bench/runners/run_rolmocr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/runners/run_rolmocr.py -------------------------------------------------------------------------------- /olmocr/bench/runners/run_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/runners/run_server.py -------------------------------------------------------------------------------- /olmocr/bench/runners/run_transformers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/runners/run_transformers.py -------------------------------------------------------------------------------- /olmocr/bench/sample_data/blanks.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/blanks.jsonl -------------------------------------------------------------------------------- /olmocr/bench/sample_data/dataset.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/dataset.jsonl -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/blank_book_pg1_pg1_repeat1.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/buildingnotes_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/buildingnotes_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/discoverworld_crazy_table4_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/discoverworld_crazy_table4_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/earnings_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/earnings_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ff0f0b22c55d8b90dd77d153f48e144fc9db_pg2_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ff0f0b22c55d8b90dd77d153f48e144fc9db_pg2_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ff1fc6a205ad039139ce566851b6b260c929_pg1_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ff1fc6a205ad039139ce566851b6b260c929_pg1_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ff3d6e051903fe5ca9bc172ece14964c5632_pg1_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ff3d6e051903fe5ca9bc172ece14964c5632_pg1_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ff4f7dad78081cff727d19ab51c181d4a661_pg1_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ff4f7dad78081cff727d19ab51c181d4a661_pg1_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ff518b1240a66978f22035528ccb029450b5_pg2_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ff518b1240a66978f22035528ccb029450b5_pg2_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/headers_footers/ffaac214730d2b8c2ec842e3618ccb9c4259_pg1_pg1_repeat1.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/headers_footers/fff590bed29a2854ac1f874dad5752ede1aa_pg1_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/headers_footers/fff590bed29a2854ac1f874dad5752ede1aa_pg1_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/lincoln_letter_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/lincoln_letter_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/math_2503_04086_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/math_2503_04086_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/mathfuncs_colswitch_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/mathfuncs_colswitch_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/mathfuncs_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/mathfuncs_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/mattsnotes_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/mattsnotes_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/mattsnotes_pg2_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/mattsnotes_pg2_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/mattsnotes_pg3_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/mattsnotes_pg3_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/multi_column_miss_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/multi_column_miss_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/olmo2-pg4_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/olmo2-pg4_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/openstax_caculus_pg_273_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/openstax_caculus_pg_273_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/small_page_size_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/small_page_size_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/olmocr_pipeline/test-graphical-text_pg1_repeat1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/olmocr_pipeline/test-graphical-text_pg1_repeat1.md -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/blank_book_pg1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/blank_book_pg1.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/buildingnotes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/buildingnotes.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/discoverworld_crazy_table4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/discoverworld_crazy_table4.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/earnings.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/earnings.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/headers_footers/ff0f0b22c55d8b90dd77d153f48e144fc9db_pg2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/headers_footers/ff0f0b22c55d8b90dd77d153f48e144fc9db_pg2.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/headers_footers/ff1fc6a205ad039139ce566851b6b260c929_pg1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/headers_footers/ff1fc6a205ad039139ce566851b6b260c929_pg1.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/headers_footers/ff3d6e051903fe5ca9bc172ece14964c5632_pg1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/headers_footers/ff3d6e051903fe5ca9bc172ece14964c5632_pg1.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/headers_footers/ff4f7dad78081cff727d19ab51c181d4a661_pg1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/headers_footers/ff4f7dad78081cff727d19ab51c181d4a661_pg1.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/headers_footers/ff518b1240a66978f22035528ccb029450b5_pg2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/headers_footers/ff518b1240a66978f22035528ccb029450b5_pg2.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/headers_footers/ffaac214730d2b8c2ec842e3618ccb9c4259_pg1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/headers_footers/ffaac214730d2b8c2ec842e3618ccb9c4259_pg1.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/headers_footers/fff590bed29a2854ac1f874dad5752ede1aa_pg1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/headers_footers/fff590bed29a2854ac1f874dad5752ede1aa_pg1.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/lincoln_letter.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/lincoln_letter.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/math_2503_04086.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/math_2503_04086.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/mathfuncs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/mathfuncs.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/mathfuncs_colswitch.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/mathfuncs_colswitch.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/mattsnotes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/mattsnotes.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/multi_column_miss.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/multi_column_miss.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/olmo2-pg4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/olmo2-pg4.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/openstax_caculus_pg_273.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/openstax_caculus_pg_273.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/small_page_size.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/small_page_size.pdf -------------------------------------------------------------------------------- /olmocr/bench/sample_data/pdfs/test-graphical-text.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/sample_data/pdfs/test-graphical-text.pdf -------------------------------------------------------------------------------- /olmocr/bench/scripts/convert_all.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/scripts/convert_all.sh -------------------------------------------------------------------------------- /olmocr/bench/scripts/difference_viewer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/scripts/difference_viewer.py -------------------------------------------------------------------------------- /olmocr/bench/scripts/rotate_pdfs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/scripts/rotate_pdfs.py -------------------------------------------------------------------------------- /olmocr/bench/scripts/rotate_pdfs_random.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/scripts/rotate_pdfs_random.sh -------------------------------------------------------------------------------- /olmocr/bench/scripts/run_difference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/scripts/run_difference.py -------------------------------------------------------------------------------- /olmocr/bench/scripts/url_matcher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/scripts/url_matcher.py -------------------------------------------------------------------------------- /olmocr/bench/scripts/workspace_to_bench.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/scripts/workspace_to_bench.py -------------------------------------------------------------------------------- /olmocr/bench/synth/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /olmocr/bench/synth/mine_html_templates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/synth/mine_html_templates.py -------------------------------------------------------------------------------- /olmocr/bench/synth/rotate_html_templates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/synth/rotate_html_templates.py -------------------------------------------------------------------------------- /olmocr/bench/templates/all_done.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/templates/all_done.html -------------------------------------------------------------------------------- /olmocr/bench/templates/all_done_latex.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/templates/all_done_latex.html -------------------------------------------------------------------------------- /olmocr/bench/templates/review.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/templates/review.html -------------------------------------------------------------------------------- /olmocr/bench/templates/review_latex.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/templates/review_latex.html -------------------------------------------------------------------------------- /olmocr/bench/tests.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/tests.py -------------------------------------------------------------------------------- /olmocr/bench/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/bench/utils.py -------------------------------------------------------------------------------- /olmocr/check.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/check.py -------------------------------------------------------------------------------- /olmocr/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /olmocr/data/build_openai_batch_from_olmocrmix.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/data/build_openai_batch_from_olmocrmix.py -------------------------------------------------------------------------------- /olmocr/data/buildsilver.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/data/buildsilver.py -------------------------------------------------------------------------------- /olmocr/data/clean_olmocrmix.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/data/clean_olmocrmix.py -------------------------------------------------------------------------------- /olmocr/data/prepare_loc_transcripts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/data/prepare_loc_transcripts.py -------------------------------------------------------------------------------- /olmocr/data/prepare_national_archive_transcripts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/data/prepare_national_archive_transcripts.py -------------------------------------------------------------------------------- /olmocr/data/prepare_olmocrmix.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/data/prepare_olmocrmix.py -------------------------------------------------------------------------------- /olmocr/data/prepare_workspace.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/data/prepare_workspace.py -------------------------------------------------------------------------------- /olmocr/data/process_openai_batch_results.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/data/process_openai_batch_results.py -------------------------------------------------------------------------------- /olmocr/data/renderpdf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/data/renderpdf.py -------------------------------------------------------------------------------- /olmocr/data/repackage_olmocrmix.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/data/repackage_olmocrmix.py -------------------------------------------------------------------------------- /olmocr/data/runopenaibatch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/data/runopenaibatch.py -------------------------------------------------------------------------------- /olmocr/datatypes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/datatypes.py -------------------------------------------------------------------------------- /olmocr/filter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/filter/__init__.py -------------------------------------------------------------------------------- /olmocr/filter/coherency.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/filter/coherency.py -------------------------------------------------------------------------------- /olmocr/filter/filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/filter/filter.py -------------------------------------------------------------------------------- /olmocr/image_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/image_utils.py -------------------------------------------------------------------------------- /olmocr/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/metrics.py -------------------------------------------------------------------------------- /olmocr/pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/pipeline.py -------------------------------------------------------------------------------- /olmocr/prompts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/prompts/__init__.py -------------------------------------------------------------------------------- /olmocr/prompts/anchor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/prompts/anchor.py -------------------------------------------------------------------------------- /olmocr/prompts/prompts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/prompts/prompts.py -------------------------------------------------------------------------------- /olmocr/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /olmocr/repeatdetect.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/repeatdetect.py -------------------------------------------------------------------------------- /olmocr/s3_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/s3_utils.py -------------------------------------------------------------------------------- /olmocr/train/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/README.md -------------------------------------------------------------------------------- /olmocr/train/compare_vllm_checkpoint.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/compare_vllm_checkpoint.py -------------------------------------------------------------------------------- /olmocr/train/compress_checkpoint.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/compress_checkpoint.py -------------------------------------------------------------------------------- /olmocr/train/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/config.py -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day2_capped.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day2_capped.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_json.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_json.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_json_1280.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_json_1280.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_json_1280_noanchor.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_json_1280_noanchor.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_json_1600.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_json_1600.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_json_lr5e-5.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_json_lr5e-5.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_json_wsd.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_json_wsd.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_yaml_1280_noanchor.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_yaml_1280_noanchor.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_yaml_1280_noanchor_128batch.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_yaml_1280_noanchor_128batch.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_yaml_1280_noanchor_latexnormalize.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_yaml_1280_noanchor_latexnormalize.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_yaml_1280_noanchor_newprompt.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_day3_yaml_1280_noanchor_newprompt.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default_fresh_prompt.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default_fresh_prompt.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default_fresh_prompt_no_doc_anchoring.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default_fresh_prompt_no_doc_anchoring.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default_grad_acc32.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default_grad_acc32.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default_image_1280.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default_image_1280.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default_image_1280_no_doc_anchor.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default_image_1280_no_doc_anchor.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default_image_1600.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default_image_1600.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default_image_1600_no_doc_anchor.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default_image_1600_no_doc_anchor.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default_json.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default_json.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default_no_doc_anchor.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_b100_x1_default_no_doc_anchor.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_1288_soupfull0.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_1288_soupfull0.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_1288_soupfull1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_1288_soupfull1.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_1288_soupfull2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_1288_soupfull2.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_1epoch.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_1epoch.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_1epoch_muon2e-5.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_1epoch_muon2e-5.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_1epoch_muon2e-6.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_1epoch_muon2e-6.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_1epoch_muon6e-6.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_1epoch_muon6e-6.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_2epoch.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_2epoch.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_2epoch_promptfirst.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_2epoch_promptfirst.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_soup0.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_soup0.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_soup1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_soup1.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_soup2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_soup2.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_tokflip.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_tokflip.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_tokflip1k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_tokflip1k.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_tokflip3k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_tokflip3k.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_tokflip500.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_tokflip500.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_tokflip_2ep.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen25_vl_olmocrv2_tokflip_2ep.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen2_vl_b100_x1_day3_json.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen2_vl_b100_x1_day3_json.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.2.0/qwen2_vl_b100_x1_day3_yaml.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.2.0/qwen2_vl_b100_x1_day3_yaml.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_1epoch.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_1epoch.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_1epoch.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_1epoch.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_2epoch.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_2epoch.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_2epoch_resumable.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_2epoch_resumable.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_augment_1epoch.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_augment_1epoch.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_augment_1epoch_resumable.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_augment_1epoch_resumable.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_augment_2epoch_resumable.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_augment_2epoch_resumable.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_localtest.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_localtest.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_tokflip_1epoch.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_tokflip_1epoch.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_finetuning.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_finetuning.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch_filtering.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch_filtering.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch_filtering_mix_0925_more_rotation.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch_filtering_mix_0925_more_rotation.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch_filtering_mix_0925_preempt.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch_filtering_mix_0925_preempt.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch_filtering_noise.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch_filtering_noise.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch_filtering_preempt.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch_filtering_preempt.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch_localtest.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch_localtest.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch_mix_0925_more_rotation.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch_mix_0925_more_rotation.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch_mix_1025.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch_mix_1025.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch_mix_1025_filtered.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch_mix_1025_filtered.yaml -------------------------------------------------------------------------------- /olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_2epoch.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_2epoch.yaml -------------------------------------------------------------------------------- /olmocr/train/dataloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/dataloader.py -------------------------------------------------------------------------------- /olmocr/train/grpo_train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/grpo_train.py -------------------------------------------------------------------------------- /olmocr/train/muon.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/muon.py -------------------------------------------------------------------------------- /olmocr/train/prepare_checkpoint.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/prepare_checkpoint.py -------------------------------------------------------------------------------- /olmocr/train/quantization_configs/qwen2_5vl_w8a16_fp8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/quantization_configs/qwen2_5vl_w8a16_fp8.yaml -------------------------------------------------------------------------------- /olmocr/train/quantization_configs/qwen2_5vl_w8a16_group.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/quantization_configs/qwen2_5vl_w8a16_group.yaml -------------------------------------------------------------------------------- /olmocr/train/quantization_configs/qwen2_5vl_w8a8_fp8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/quantization_configs/qwen2_5vl_w8a8_fp8.yaml -------------------------------------------------------------------------------- /olmocr/train/quantization_configs/qwen2_5vl_w8a8_fp8_alt.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/quantization_configs/qwen2_5vl_w8a8_fp8_alt.yaml -------------------------------------------------------------------------------- /olmocr/train/quantization_configs/qwen2_5vl_w8a8_fp8_block.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/quantization_configs/qwen2_5vl_w8a8_fp8_block.yaml -------------------------------------------------------------------------------- /olmocr/train/quantization_configs/qwen2_5vl_w8a8_fp8_kv8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/quantization_configs/qwen2_5vl_w8a8_fp8_kv8.yaml -------------------------------------------------------------------------------- /olmocr/train/quantization_configs/qwen2_5vl_w8a8_int8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/quantization_configs/qwen2_5vl_w8a8_int8.yaml -------------------------------------------------------------------------------- /olmocr/train/quantization_configs/qwen2vl_w8a8_fp8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/quantization_configs/qwen2vl_w8a8_fp8.yaml -------------------------------------------------------------------------------- /olmocr/train/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/train/train.py -------------------------------------------------------------------------------- /olmocr/version.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/version.py -------------------------------------------------------------------------------- /olmocr/viewer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /olmocr/viewer/dolmaviewer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/viewer/dolmaviewer.py -------------------------------------------------------------------------------- /olmocr/viewer/dolmaviewer_merged_template.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/viewer/dolmaviewer_merged_template.html -------------------------------------------------------------------------------- /olmocr/viewer/dolmaviewer_template.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/viewer/dolmaviewer_template.html -------------------------------------------------------------------------------- /olmocr/work_queue.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/olmocr/work_queue.py -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/pyproject.toml -------------------------------------------------------------------------------- /scripts/beaker/Dockerfile-gpu-ci: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/beaker/Dockerfile-gpu-ci -------------------------------------------------------------------------------- /scripts/beaker/gpu-ci-script.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/beaker/gpu-ci-script.sh -------------------------------------------------------------------------------- /scripts/beaker/jupiter-ib.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/beaker/jupiter-ib.sh -------------------------------------------------------------------------------- /scripts/beaker/pluto-ib.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/beaker/pluto-ib.sh -------------------------------------------------------------------------------- /scripts/check_contamination.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/check_contamination.py -------------------------------------------------------------------------------- /scripts/compare_vllm.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/compare_vllm.sh -------------------------------------------------------------------------------- /scripts/compress_model.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/compress_model.sh -------------------------------------------------------------------------------- /scripts/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/data/buildsilver.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/data/buildsilver.py -------------------------------------------------------------------------------- /scripts/data/buildsilverdatasummary.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/data/buildsilverdatasummary.py -------------------------------------------------------------------------------- /scripts/data/buildtestset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/data/buildtestset.py -------------------------------------------------------------------------------- /scripts/data/convertsilver_birr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/data/convertsilver_birr.py -------------------------------------------------------------------------------- /scripts/data/convertsilver_openai.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/data/convertsilver_openai.py -------------------------------------------------------------------------------- /scripts/data/runopenaibatch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/data/runopenaibatch.py -------------------------------------------------------------------------------- /scripts/elo/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/elo/README.md -------------------------------------------------------------------------------- /scripts/elo/boxplots.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/elo/boxplots.png -------------------------------------------------------------------------------- /scripts/elo/calculate_elo_ratings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/elo/calculate_elo_ratings.py -------------------------------------------------------------------------------- /scripts/elo/draw_boxplots.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/elo/draw_boxplots.py -------------------------------------------------------------------------------- /scripts/elo/ratings.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/elo/ratings.csv -------------------------------------------------------------------------------- /scripts/elo/results.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/elo/results.txt -------------------------------------------------------------------------------- /scripts/eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/eval/buildelo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/eval/buildelo.py -------------------------------------------------------------------------------- /scripts/eval/dolma_refine/aligners.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/eval/dolma_refine/aligners.py -------------------------------------------------------------------------------- /scripts/eval/dolma_refine/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/eval/dolma_refine/metrics.py -------------------------------------------------------------------------------- /scripts/eval/dolma_refine/registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/eval/dolma_refine/registry.py -------------------------------------------------------------------------------- /scripts/eval/dolma_refine/segmenters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/eval/dolma_refine/segmenters.py -------------------------------------------------------------------------------- /scripts/eval/evalhtml.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/eval/evalhtml.py -------------------------------------------------------------------------------- /scripts/eval/evalhtml_template.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/eval/evalhtml_template.html -------------------------------------------------------------------------------- /scripts/eval/runeval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/eval/runeval.py -------------------------------------------------------------------------------- /scripts/eval/scoreelo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/eval/scoreelo.py -------------------------------------------------------------------------------- /scripts/hf_local_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/hf_local_test.py -------------------------------------------------------------------------------- /scripts/infinigram_count.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/infinigram_count.py -------------------------------------------------------------------------------- /scripts/jsonl_to_markdown.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/jsonl_to_markdown.py -------------------------------------------------------------------------------- /scripts/movedolmadocs_to_md.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/movedolmadocs_to_md.py -------------------------------------------------------------------------------- /scripts/parse_with_pdfminer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/parse_with_pdfminer.py -------------------------------------------------------------------------------- /scripts/pii/autoscan_dolmadocs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/pii/autoscan_dolmadocs.py -------------------------------------------------------------------------------- /scripts/pii/chatgpt_tag_dolmadocs_v1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/pii/chatgpt_tag_dolmadocs_v1.py -------------------------------------------------------------------------------- /scripts/pii/chatgpt_tag_dolmadocs_v2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/pii/chatgpt_tag_dolmadocs_v2.py -------------------------------------------------------------------------------- /scripts/pii/check_qual.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/pii/check_qual.sh -------------------------------------------------------------------------------- /scripts/pii/pii_rule_comparison.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/pii/pii_rule_comparison.py -------------------------------------------------------------------------------- /scripts/pii/rich_tagging_pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/pii/rich_tagging_pipeline.py -------------------------------------------------------------------------------- /scripts/pii/run_tagging_pipeline.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/pii/run_tagging_pipeline.sh -------------------------------------------------------------------------------- /scripts/pii/tagging_pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/pii/tagging_pipeline.py -------------------------------------------------------------------------------- /scripts/pii/tagging_pipeline_v2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/pii/tagging_pipeline_v2.py -------------------------------------------------------------------------------- /scripts/plots/ocr_pareto.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/plots/ocr_pareto.pdf -------------------------------------------------------------------------------- /scripts/plots/ocr_pareto.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/plots/ocr_pareto.png -------------------------------------------------------------------------------- /scripts/plots/olmocr2_timeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/plots/olmocr2_timeline.png -------------------------------------------------------------------------------- /scripts/plots/pareto_plot.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/plots/pareto_plot.py -------------------------------------------------------------------------------- /scripts/plots/plot_olmocr2_timeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/plots/plot_olmocr2_timeline.py -------------------------------------------------------------------------------- /scripts/prepare_changelog.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/prepare_changelog.py -------------------------------------------------------------------------------- /scripts/release.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/release.sh -------------------------------------------------------------------------------- /scripts/release_notes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/release_notes.py -------------------------------------------------------------------------------- /scripts/remove_paths_from_olmocrmix.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/remove_paths_from_olmocrmix.py -------------------------------------------------------------------------------- /scripts/run_benchmark.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/run_benchmark.sh -------------------------------------------------------------------------------- /scripts/run_benchmark_guided_decoding.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/run_benchmark_guided_decoding.sh -------------------------------------------------------------------------------- /scripts/run_infrapartner_benchmark.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/run_infrapartner_benchmark.sh -------------------------------------------------------------------------------- /scripts/run_integration_test.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/run_integration_test.sh -------------------------------------------------------------------------------- /scripts/run_marker_benchmark.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/run_marker_benchmark.sh -------------------------------------------------------------------------------- /scripts/run_mineru_benchmark.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/run_mineru_benchmark.sh -------------------------------------------------------------------------------- /scripts/run_paddle_benchmark.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/run_paddle_benchmark.sh -------------------------------------------------------------------------------- /scripts/run_paddlevl_benchmark.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/run_paddlevl_benchmark.sh -------------------------------------------------------------------------------- /scripts/run_transformers_benchmark.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/run_transformers_benchmark.sh -------------------------------------------------------------------------------- /scripts/s2orc_extractor.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/s2orc_extractor.sh -------------------------------------------------------------------------------- /scripts/scan_dolmadocs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/scan_dolmadocs.py -------------------------------------------------------------------------------- /scripts/sync_beaker_image.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/sync_beaker_image.sh -------------------------------------------------------------------------------- /scripts/train/grpotrainer-beaker-multi-gpu-augusta.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/train/grpotrainer-beaker-multi-gpu-augusta.sh -------------------------------------------------------------------------------- /scripts/train/grpotrainer-beaker-multi-gpu.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/train/grpotrainer-beaker-multi-gpu.sh -------------------------------------------------------------------------------- /scripts/train/grpotrainer-beaker.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/train/grpotrainer-beaker.sh -------------------------------------------------------------------------------- /scripts/train/newtrainer-beaker.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/train/newtrainer-beaker.sh -------------------------------------------------------------------------------- /scripts/train/newtrainer-frontier.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/scripts/train/newtrainer-frontier.sh -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/gnarly_pdfs/ambiguous.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/ambiguous.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/badlines.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/badlines.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/bws_book_ch2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/bws_book_ch2.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/discoverworld_crazy_tables.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/discoverworld_crazy_tables.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/dolma-page-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/dolma-page-1.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/edgar-rotated90.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/edgar-rotated90.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/edgar.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/edgar.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/edgar_image.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/edgar_image.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/failing_anchor_pg4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/failing_anchor_pg4.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/failing_pdf_pg9.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/failing_pdf_pg9.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/form_on_later_pages.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/form_on_later_pages.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/guidebook_failed_pages.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/guidebook_failed_pages.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/handwriting_bad_ocr.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/handwriting_bad_ocr.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/horribleocr.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/horribleocr.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/instructions_and_schematics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/instructions_and_schematics.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/large_prompt_hint1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/large_prompt_hint1.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/large_prompt_hint2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/large_prompt_hint2.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/large_prompt_hint3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/large_prompt_hint3.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/load_v_error.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/load_v_error.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/lots_of_chem_tables.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/lots_of_chem_tables.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/lots_of_sci_tables.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/lots_of_sci_tables.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/map1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/map1.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/most_content_in_image_form.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/most_content_in_image_form.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/newspaper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/newspaper.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/not_parsing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/not_parsing.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/not_parsing2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/not_parsing2.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/olmo-page-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/olmo-page-1.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/overrun_on_pg8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/overrun_on_pg8.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/pdftotext_two_column_issue.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/pdftotext_two_column_issue.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/repeating_references_on_pg9_pg10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/repeating_references_on_pg9_pg10.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/skinnypage.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/skinnypage.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/slideshow_mostly_good_some_pages_should_get_filtered.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/slideshow_mostly_good_some_pages_should_get_filtered.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/slideshow_mostly_images.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/slideshow_mostly_images.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/small_page_size.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/small_page_size.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/some_ocr1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/some_ocr1.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/ti89_guidebook_programming.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/ti89_guidebook_programming.pdf -------------------------------------------------------------------------------- /tests/gnarly_pdfs/tobacco_missed_tokens_pg1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/gnarly_pdfs/tobacco_missed_tokens_pg1.pdf -------------------------------------------------------------------------------- /tests/sample_dataset/empty_document/blanktext.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/sample_dataset/empty_document/blanktext.md -------------------------------------------------------------------------------- /tests/sample_dataset/empty_document/blanktext.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/sample_dataset/empty_document/blanktext.pdf -------------------------------------------------------------------------------- /tests/sample_dataset/simple_document/edgar.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/sample_dataset/simple_document/edgar.md -------------------------------------------------------------------------------- /tests/sample_dataset/simple_document/edgar.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/sample_dataset/simple_document/edgar.pdf -------------------------------------------------------------------------------- /tests/sample_dataset/urls.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/sample_dataset/urls.jsonl -------------------------------------------------------------------------------- /tests/test_anchor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/test_anchor.py -------------------------------------------------------------------------------- /tests/test_dataloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/test_dataloader.py -------------------------------------------------------------------------------- /tests/test_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/test_filter.py -------------------------------------------------------------------------------- /tests/test_grpo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/test_grpo.py -------------------------------------------------------------------------------- /tests/test_integration.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/test_integration.py -------------------------------------------------------------------------------- /tests/test_katex_render.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/test_katex_render.py -------------------------------------------------------------------------------- /tests/test_mine_html_templates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/test_mine_html_templates.py -------------------------------------------------------------------------------- /tests/test_olmocrmix.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/test_olmocrmix.py -------------------------------------------------------------------------------- /tests/test_pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/test_pipeline.py -------------------------------------------------------------------------------- /tests/test_s3_work_queue.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/test_s3_work_queue.py -------------------------------------------------------------------------------- /tests/test_tests.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/olmocr/HEAD/tests/test_tests.py --------------------------------------------------------------------------------