├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── demo.py ├── rapid_doc ├── __init__.py ├── layout_recover │ ├── __init__.py │ └── main.py ├── main.py ├── ocr_extract │ ├── __init__.py │ └── main.py ├── pdf_extract │ ├── __init__.py │ └── main.py └── utils.py ├── requirements.txt ├── scripts ├── test_fitz.py └── test_pdfminer.py ├── test_pdf_extract.py └── tests ├── test_files ├── direct_extract │ ├── single_column.pdf │ ├── two_column.pdf │ └── two_column_img_table.pdf └── scan_pdf │ └── B0702罗马十二帝王传Page3_5.pdf └── test_main.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/.gitignore -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/README.md -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/demo.py -------------------------------------------------------------------------------- /rapid_doc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/rapid_doc/__init__.py -------------------------------------------------------------------------------- /rapid_doc/layout_recover/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/rapid_doc/layout_recover/__init__.py -------------------------------------------------------------------------------- /rapid_doc/layout_recover/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/rapid_doc/layout_recover/main.py -------------------------------------------------------------------------------- /rapid_doc/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/rapid_doc/main.py -------------------------------------------------------------------------------- /rapid_doc/ocr_extract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/rapid_doc/ocr_extract/__init__.py -------------------------------------------------------------------------------- /rapid_doc/ocr_extract/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/rapid_doc/ocr_extract/main.py -------------------------------------------------------------------------------- /rapid_doc/pdf_extract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/rapid_doc/pdf_extract/__init__.py -------------------------------------------------------------------------------- /rapid_doc/pdf_extract/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/rapid_doc/pdf_extract/main.py -------------------------------------------------------------------------------- /rapid_doc/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/rapid_doc/utils.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/requirements.txt -------------------------------------------------------------------------------- /scripts/test_fitz.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/scripts/test_fitz.py -------------------------------------------------------------------------------- /scripts/test_pdfminer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/scripts/test_pdfminer.py -------------------------------------------------------------------------------- /test_pdf_extract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/test_pdf_extract.py -------------------------------------------------------------------------------- /tests/test_files/direct_extract/single_column.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/tests/test_files/direct_extract/single_column.pdf -------------------------------------------------------------------------------- /tests/test_files/direct_extract/two_column.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/tests/test_files/direct_extract/two_column.pdf -------------------------------------------------------------------------------- /tests/test_files/direct_extract/two_column_img_table.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/tests/test_files/direct_extract/two_column_img_table.pdf -------------------------------------------------------------------------------- /tests/test_files/scan_pdf/B0702罗马十二帝王传Page3_5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/tests/test_files/scan_pdf/B0702罗马十二帝王传Page3_5.pdf -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidDocEx/HEAD/tests/test_main.py --------------------------------------------------------------------------------