├── .dockerignore ├── .github ├── FUNDING.yml ├── copilot-instructions.md ├── dependabot.yml └── workflows │ ├── push_docker_image.yml │ └── test.yml ├── .gitignore ├── .well-known └── funding-manifest-urls ├── Dockerfile ├── Dockerfile.gradio ├── Dockerfile.ollama ├── LICENSE ├── Makefile ├── README.md ├── dev-requirements.txt ├── docker-compose-gpu.yml ├── docker-compose.yml ├── fine_tuning_lightgbm_models.ipynb ├── images ├── ui.png ├── vgtexample1.png ├── vgtexample2.png ├── vgtexample3.png └── vgtexample4.png ├── justfile ├── pyproject.toml ├── requirements-gradio.txt ├── requirements.txt ├── src ├── adapters │ ├── __init__.py │ ├── infrastructure │ │ ├── __init__.py │ │ ├── format_conversion_service_adapter.py │ │ ├── format_converters │ │ │ ├── __init__.py │ │ │ ├── convert_formula_to_latex.py │ │ │ └── convert_table_to_html.py │ │ ├── html_conversion_service_adapter.py │ │ ├── markdown_conversion_service_adapter.py │ │ ├── markup_conversion │ │ │ ├── ExtractedImage.py │ │ │ ├── Link.py │ │ │ ├── OutputFormat.py │ │ │ ├── __init__.py │ │ │ └── pdf_to_markup_service_adapter.py │ │ ├── ocr │ │ │ ├── __init__.py │ │ │ └── languages.py │ │ ├── ocr_service_adapter.py │ │ ├── pdf_analysis_service_adapter.py │ │ ├── text_extraction_adapter.py │ │ ├── toc │ │ │ ├── MergeTwoSegmentsTitles.py │ │ │ ├── PdfSegmentation.py │ │ │ ├── TOCExtractor.py │ │ │ ├── TitleFeatures.py │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ │ ├── TOCItem.py │ │ │ │ └── __init__.py │ │ │ ├── extract_table_of_contents.py │ │ │ └── methods │ │ │ │ ├── __init__.py │ │ │ │ └── two_models_v3_segments_context_2 │ │ │ │ ├── Modes.py │ │ │ │ └── __init__.py │ │ ├── toc_service_adapter.py │ │ ├── translation │ │ │ ├── decode_html_content.py │ │ │ ├── decode_markdown_content.py │ │ │ ├── download_translation_model.py │ │ │ ├── encode_html_content.py │ │ │ ├── encode_markdown_content.py │ │ │ ├── ollama_container_manager.py │ │ │ └── translate_markup_document.py │ │ └── visualization_service_adapter.py │ ├── ml │ │ ├── __init__.py │ │ ├── fast_trainer │ │ │ ├── Paragraph.py │ │ │ ├── ParagraphExtractorTrainer.py │ │ │ ├── __init__.py │ │ │ └── model_configuration.py │ │ ├── fast_trainer_adapter.py │ │ ├── pdf_tokens_type_trainer │ │ │ ├── ModelConfiguration.py │ │ │ ├── PdfTrainer.py │ │ │ ├── TokenFeatures.py │ │ │ ├── TokenTypeTrainer.py │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── download_models.py │ │ │ ├── get_paths.py │ │ │ └── tests │ │ │ │ ├── __init__.py │ │ │ │ └── test_trainer.py │ │ ├── vgt │ │ │ ├── __init__.py │ │ │ ├── bros │ │ │ │ ├── __init__.py │ │ │ │ ├── configuration_bros.py │ │ │ │ ├── modeling_bros.py │ │ │ │ ├── tokenization_bros.py │ │ │ │ └── tokenization_bros_fast.py │ │ │ ├── create_word_grid.py │ │ │ ├── ditod │ │ │ │ ├── FeatureMerge.py │ │ │ │ ├── VGT.py │ │ │ │ ├── VGTTrainer.py │ │ │ │ ├── VGTbackbone.py │ │ │ │ ├── VGTbeit.py │ │ │ │ ├── VGTcheckpointer.py │ │ │ │ ├── Wordnn_embedding.py │ │ │ │ ├── __init__.py │ │ │ │ ├── config.py │ │ │ │ ├── dataset_mapper.py │ │ │ │ ├── tokenization_bros.py │ │ │ │ └── utils.py │ │ │ ├── get_json_annotations.py │ │ │ ├── get_model_configuration.py │ │ │ ├── get_most_probable_pdf_segments.py │ │ │ ├── get_reading_orders.py │ │ │ └── model_configuration │ │ │ │ ├── Base-RCNN-FPN.yaml │ │ │ │ ├── doclaynet_VGT_cascade_PTM.yaml │ │ │ │ └── doclaynet_configuration.pickle │ │ └── vgt_model_adapter.py │ ├── storage │ │ ├── __init__.py │ │ └── file_system_repository.py │ └── web │ │ ├── __init__.py │ │ └── fastapi_controllers.py ├── app.py ├── catch_exceptions.py ├── configuration.py ├── domain │ ├── PdfImages.py │ ├── PdfSegment.py │ ├── Prediction.py │ └── SegmentBox.py ├── download_models.py ├── drivers │ ├── __init__.py │ └── web │ │ ├── __init__.py │ │ └── dependency_injection.py ├── gradio_app.py ├── ports │ ├── __init__.py │ ├── repositories │ │ ├── __init__.py │ │ └── file_repository.py │ └── services │ │ ├── __init__.py │ │ ├── format_conversion_service.py │ │ ├── html_conversion_service.py │ │ ├── markdown_conversion_service.py │ │ ├── ml_model_service.py │ │ ├── ocr_service.py │ │ ├── pdf_analysis_service.py │ │ ├── text_extraction_service.py │ │ ├── toc_service.py │ │ └── visualization_service.py ├── tests │ ├── __init__.py │ └── test_end_to_end.py └── use_cases │ ├── __init__.py │ ├── html_conversion │ ├── __init__.py │ └── convert_to_html_use_case.py │ ├── markdown_conversion │ ├── __init__.py │ └── convert_to_markdown_use_case.py │ ├── ocr │ ├── __init__.py │ └── process_ocr_use_case.py │ ├── pdf_analysis │ ├── __init__.py │ ├── analyze_pdf_use_case.py │ └── get_pdf_word_positions.py │ ├── text_extraction │ ├── __init__.py │ └── extract_text_use_case.py │ ├── toc_extraction │ ├── __init__.py │ └── extract_toc_use_case.py │ └── visualization │ ├── __init__.py │ └── create_visualization_use_case.py ├── start.sh └── test_pdfs ├── blank.pdf ├── chinese.pdf ├── error.pdf ├── formula.pdf ├── image.pdf ├── korean.pdf ├── not_a_pdf.pdf ├── ocr-sample-already-ocred.pdf ├── ocr-sample-english.pdf ├── ocr-sample-french.pdf ├── ocr_pdf.pdf ├── regular.pdf ├── some_empty_pages.pdf ├── table.pdf ├── test.pdf ├── toc-test.pdf └── toc-test.xml /.dockerignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/.dockerignore -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | custom: ["https://huridocs.org/donate/"] 2 | -------------------------------------------------------------------------------- /.github/copilot-instructions.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/.github/dependabot.yml -------------------------------------------------------------------------------- /.github/workflows/push_docker_image.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/.github/workflows/push_docker_image.yml -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/.github/workflows/test.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/.gitignore -------------------------------------------------------------------------------- /.well-known/funding-manifest-urls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/.well-known/funding-manifest-urls -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/Dockerfile -------------------------------------------------------------------------------- /Dockerfile.gradio: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/Dockerfile.gradio -------------------------------------------------------------------------------- /Dockerfile.ollama: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/Dockerfile.ollama -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/LICENSE -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/Makefile -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/README.md -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/dev-requirements.txt -------------------------------------------------------------------------------- /docker-compose-gpu.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/docker-compose-gpu.yml -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/docker-compose.yml -------------------------------------------------------------------------------- /fine_tuning_lightgbm_models.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/fine_tuning_lightgbm_models.ipynb -------------------------------------------------------------------------------- /images/ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/images/ui.png -------------------------------------------------------------------------------- /images/vgtexample1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/images/vgtexample1.png -------------------------------------------------------------------------------- /images/vgtexample2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/images/vgtexample2.png -------------------------------------------------------------------------------- /images/vgtexample3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/images/vgtexample3.png -------------------------------------------------------------------------------- /images/vgtexample4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/images/vgtexample4.png -------------------------------------------------------------------------------- /justfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/justfile -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/pyproject.toml -------------------------------------------------------------------------------- /requirements-gradio.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/requirements-gradio.txt -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/requirements.txt -------------------------------------------------------------------------------- /src/adapters/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/adapters/infrastructure/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/adapters/infrastructure/format_conversion_service_adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/format_conversion_service_adapter.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/format_converters/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/adapters/infrastructure/format_converters/convert_formula_to_latex.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/format_converters/convert_formula_to_latex.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/format_converters/convert_table_to_html.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/format_converters/convert_table_to_html.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/html_conversion_service_adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/html_conversion_service_adapter.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/markdown_conversion_service_adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/markdown_conversion_service_adapter.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/markup_conversion/ExtractedImage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/markup_conversion/ExtractedImage.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/markup_conversion/Link.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/markup_conversion/Link.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/markup_conversion/OutputFormat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/markup_conversion/OutputFormat.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/markup_conversion/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/adapters/infrastructure/markup_conversion/pdf_to_markup_service_adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/markup_conversion/pdf_to_markup_service_adapter.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/ocr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/adapters/infrastructure/ocr/languages.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/ocr/languages.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/ocr_service_adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/ocr_service_adapter.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/pdf_analysis_service_adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/pdf_analysis_service_adapter.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/text_extraction_adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/text_extraction_adapter.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/toc/MergeTwoSegmentsTitles.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/toc/MergeTwoSegmentsTitles.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/toc/PdfSegmentation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/toc/PdfSegmentation.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/toc/TOCExtractor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/toc/TOCExtractor.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/toc/TitleFeatures.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/toc/TitleFeatures.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/toc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/adapters/infrastructure/toc/data/TOCItem.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/toc/data/TOCItem.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/toc/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/adapters/infrastructure/toc/extract_table_of_contents.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/toc/extract_table_of_contents.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/toc/methods/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/adapters/infrastructure/toc/methods/two_models_v3_segments_context_2/Modes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/toc/methods/two_models_v3_segments_context_2/Modes.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/toc/methods/two_models_v3_segments_context_2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/adapters/infrastructure/toc_service_adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/toc_service_adapter.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/translation/decode_html_content.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/translation/decode_html_content.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/translation/decode_markdown_content.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/translation/decode_markdown_content.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/translation/download_translation_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/translation/download_translation_model.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/translation/encode_html_content.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/translation/encode_html_content.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/translation/encode_markdown_content.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/translation/encode_markdown_content.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/translation/ollama_container_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/translation/ollama_container_manager.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/translation/translate_markup_document.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/translation/translate_markup_document.py -------------------------------------------------------------------------------- /src/adapters/infrastructure/visualization_service_adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/infrastructure/visualization_service_adapter.py -------------------------------------------------------------------------------- /src/adapters/ml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/adapters/ml/fast_trainer/Paragraph.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/fast_trainer/Paragraph.py -------------------------------------------------------------------------------- /src/adapters/ml/fast_trainer/ParagraphExtractorTrainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/fast_trainer/ParagraphExtractorTrainer.py -------------------------------------------------------------------------------- /src/adapters/ml/fast_trainer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/adapters/ml/fast_trainer/model_configuration.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/fast_trainer/model_configuration.py -------------------------------------------------------------------------------- /src/adapters/ml/fast_trainer_adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/fast_trainer_adapter.py -------------------------------------------------------------------------------- /src/adapters/ml/pdf_tokens_type_trainer/ModelConfiguration.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/pdf_tokens_type_trainer/ModelConfiguration.py -------------------------------------------------------------------------------- /src/adapters/ml/pdf_tokens_type_trainer/PdfTrainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/pdf_tokens_type_trainer/PdfTrainer.py -------------------------------------------------------------------------------- /src/adapters/ml/pdf_tokens_type_trainer/TokenFeatures.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/pdf_tokens_type_trainer/TokenFeatures.py -------------------------------------------------------------------------------- /src/adapters/ml/pdf_tokens_type_trainer/TokenTypeTrainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/pdf_tokens_type_trainer/TokenTypeTrainer.py -------------------------------------------------------------------------------- /src/adapters/ml/pdf_tokens_type_trainer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/adapters/ml/pdf_tokens_type_trainer/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/pdf_tokens_type_trainer/config.py -------------------------------------------------------------------------------- /src/adapters/ml/pdf_tokens_type_trainer/download_models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/pdf_tokens_type_trainer/download_models.py -------------------------------------------------------------------------------- /src/adapters/ml/pdf_tokens_type_trainer/get_paths.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/pdf_tokens_type_trainer/get_paths.py -------------------------------------------------------------------------------- /src/adapters/ml/pdf_tokens_type_trainer/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/adapters/ml/pdf_tokens_type_trainer/tests/test_trainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/pdf_tokens_type_trainer/tests/test_trainer.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/adapters/ml/vgt/bros/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/bros/__init__.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/bros/configuration_bros.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/bros/configuration_bros.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/bros/modeling_bros.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/bros/modeling_bros.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/bros/tokenization_bros.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/bros/tokenization_bros.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/bros/tokenization_bros_fast.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/bros/tokenization_bros_fast.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/create_word_grid.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/create_word_grid.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/ditod/FeatureMerge.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/ditod/FeatureMerge.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/ditod/VGT.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/ditod/VGT.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/ditod/VGTTrainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/ditod/VGTTrainer.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/ditod/VGTbackbone.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/ditod/VGTbackbone.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/ditod/VGTbeit.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/ditod/VGTbeit.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/ditod/VGTcheckpointer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/ditod/VGTcheckpointer.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/ditod/Wordnn_embedding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/ditod/Wordnn_embedding.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/ditod/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/ditod/__init__.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/ditod/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/ditod/config.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/ditod/dataset_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/ditod/dataset_mapper.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/ditod/tokenization_bros.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/ditod/tokenization_bros.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/ditod/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/ditod/utils.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/get_json_annotations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/get_json_annotations.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/get_model_configuration.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/get_model_configuration.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/get_most_probable_pdf_segments.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/get_most_probable_pdf_segments.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/get_reading_orders.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/get_reading_orders.py -------------------------------------------------------------------------------- /src/adapters/ml/vgt/model_configuration/Base-RCNN-FPN.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/model_configuration/Base-RCNN-FPN.yaml -------------------------------------------------------------------------------- /src/adapters/ml/vgt/model_configuration/doclaynet_VGT_cascade_PTM.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/model_configuration/doclaynet_VGT_cascade_PTM.yaml -------------------------------------------------------------------------------- /src/adapters/ml/vgt/model_configuration/doclaynet_configuration.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt/model_configuration/doclaynet_configuration.pickle -------------------------------------------------------------------------------- /src/adapters/ml/vgt_model_adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/ml/vgt_model_adapter.py -------------------------------------------------------------------------------- /src/adapters/storage/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/adapters/storage/file_system_repository.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/storage/file_system_repository.py -------------------------------------------------------------------------------- /src/adapters/web/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/adapters/web/fastapi_controllers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/adapters/web/fastapi_controllers.py -------------------------------------------------------------------------------- /src/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/app.py -------------------------------------------------------------------------------- /src/catch_exceptions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/catch_exceptions.py -------------------------------------------------------------------------------- /src/configuration.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/configuration.py -------------------------------------------------------------------------------- /src/domain/PdfImages.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/domain/PdfImages.py -------------------------------------------------------------------------------- /src/domain/PdfSegment.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/domain/PdfSegment.py -------------------------------------------------------------------------------- /src/domain/Prediction.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/domain/Prediction.py -------------------------------------------------------------------------------- /src/domain/SegmentBox.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/domain/SegmentBox.py -------------------------------------------------------------------------------- /src/download_models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/download_models.py -------------------------------------------------------------------------------- /src/drivers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/drivers/web/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/drivers/web/dependency_injection.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/drivers/web/dependency_injection.py -------------------------------------------------------------------------------- /src/gradio_app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/gradio_app.py -------------------------------------------------------------------------------- /src/ports/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ports/repositories/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ports/repositories/file_repository.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/ports/repositories/file_repository.py -------------------------------------------------------------------------------- /src/ports/services/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ports/services/format_conversion_service.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/ports/services/format_conversion_service.py -------------------------------------------------------------------------------- /src/ports/services/html_conversion_service.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/ports/services/html_conversion_service.py -------------------------------------------------------------------------------- /src/ports/services/markdown_conversion_service.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/ports/services/markdown_conversion_service.py -------------------------------------------------------------------------------- /src/ports/services/ml_model_service.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/ports/services/ml_model_service.py -------------------------------------------------------------------------------- /src/ports/services/ocr_service.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/ports/services/ocr_service.py -------------------------------------------------------------------------------- /src/ports/services/pdf_analysis_service.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/ports/services/pdf_analysis_service.py -------------------------------------------------------------------------------- /src/ports/services/text_extraction_service.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/ports/services/text_extraction_service.py -------------------------------------------------------------------------------- /src/ports/services/toc_service.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/ports/services/toc_service.py -------------------------------------------------------------------------------- /src/ports/services/visualization_service.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/ports/services/visualization_service.py -------------------------------------------------------------------------------- /src/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/tests/test_end_to_end.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/tests/test_end_to_end.py -------------------------------------------------------------------------------- /src/use_cases/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/use_cases/html_conversion/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/use_cases/html_conversion/convert_to_html_use_case.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/use_cases/html_conversion/convert_to_html_use_case.py -------------------------------------------------------------------------------- /src/use_cases/markdown_conversion/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/use_cases/markdown_conversion/convert_to_markdown_use_case.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/use_cases/markdown_conversion/convert_to_markdown_use_case.py -------------------------------------------------------------------------------- /src/use_cases/ocr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/use_cases/ocr/process_ocr_use_case.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/use_cases/ocr/process_ocr_use_case.py -------------------------------------------------------------------------------- /src/use_cases/pdf_analysis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/use_cases/pdf_analysis/analyze_pdf_use_case.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/use_cases/pdf_analysis/analyze_pdf_use_case.py -------------------------------------------------------------------------------- /src/use_cases/pdf_analysis/get_pdf_word_positions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/use_cases/pdf_analysis/get_pdf_word_positions.py -------------------------------------------------------------------------------- /src/use_cases/text_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/use_cases/text_extraction/extract_text_use_case.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/use_cases/text_extraction/extract_text_use_case.py -------------------------------------------------------------------------------- /src/use_cases/toc_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/use_cases/toc_extraction/extract_toc_use_case.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/use_cases/toc_extraction/extract_toc_use_case.py -------------------------------------------------------------------------------- /src/use_cases/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/use_cases/visualization/create_visualization_use_case.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/src/use_cases/visualization/create_visualization_use_case.py -------------------------------------------------------------------------------- /start.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/start.sh -------------------------------------------------------------------------------- /test_pdfs/blank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/test_pdfs/blank.pdf -------------------------------------------------------------------------------- /test_pdfs/chinese.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/test_pdfs/chinese.pdf -------------------------------------------------------------------------------- /test_pdfs/error.pdf: -------------------------------------------------------------------------------- 1 | error -------------------------------------------------------------------------------- /test_pdfs/formula.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/test_pdfs/formula.pdf -------------------------------------------------------------------------------- /test_pdfs/image.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/test_pdfs/image.pdf -------------------------------------------------------------------------------- /test_pdfs/korean.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/test_pdfs/korean.pdf -------------------------------------------------------------------------------- /test_pdfs/not_a_pdf.pdf: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test_pdfs/ocr-sample-already-ocred.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/test_pdfs/ocr-sample-already-ocred.pdf -------------------------------------------------------------------------------- /test_pdfs/ocr-sample-english.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/test_pdfs/ocr-sample-english.pdf -------------------------------------------------------------------------------- /test_pdfs/ocr-sample-french.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/test_pdfs/ocr-sample-french.pdf -------------------------------------------------------------------------------- /test_pdfs/ocr_pdf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/test_pdfs/ocr_pdf.pdf -------------------------------------------------------------------------------- /test_pdfs/regular.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/test_pdfs/regular.pdf -------------------------------------------------------------------------------- /test_pdfs/some_empty_pages.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/test_pdfs/some_empty_pages.pdf -------------------------------------------------------------------------------- /test_pdfs/table.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/test_pdfs/table.pdf -------------------------------------------------------------------------------- /test_pdfs/test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/test_pdfs/test.pdf -------------------------------------------------------------------------------- /test_pdfs/toc-test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/test_pdfs/toc-test.pdf -------------------------------------------------------------------------------- /test_pdfs/toc-test.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huridocs/pdf-document-layout-analysis/HEAD/test_pdfs/toc-test.xml --------------------------------------------------------------------------------