├── .github ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ └── feature_request.md ├── example.png ├── layout-parser.png ├── lp.png └── workflows │ ├── ci.yml │ └── release.yml ├── .gitignore ├── .readthedocs.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── dev-requirements.txt ├── docs ├── Makefile ├── api_doc │ ├── elements.rst │ ├── io.rst │ ├── models.rst │ ├── ocr.rst │ └── visualization.rst ├── conf.py ├── example │ ├── deep_layout_parsing │ │ ├── index.rst │ │ ├── output_21_0.png │ │ └── output_7_0.png │ ├── load_coco │ │ ├── index.rst │ │ ├── output_10_0.png │ │ ├── output_15_0.png │ │ └── output_8_0.png │ └── parse_ocr │ │ ├── index.rst │ │ ├── output_14_0.png │ │ ├── output_17_0.png │ │ ├── output_19_0.png │ │ ├── output_25_0.png │ │ └── output_6_1.png ├── index.rst ├── make.bat └── notes │ ├── installation.md │ ├── intersection.png │ ├── modelzoo.md │ ├── quickstart.rst │ ├── shape_operations.md │ └── union.png ├── examples ├── Customizing Layout Models with Label Studio Annotation │ ├── Customizing Layout Models with Label Studio Annotation.ipynb │ ├── README.md │ ├── download_annotation.py │ ├── pipeline-overview.jpg │ └── task-overview.png ├── Deep Layout Parsing.ipynb ├── Load and visualize layout annotations in the COCO format.ipynb ├── OCR Tables and Parse the Output.ipynb └── data │ ├── example-table.jpeg │ └── paper-image.jpg ├── installation.md ├── setup.cfg ├── setup.py ├── src └── layoutparser │ ├── __init__.py │ ├── elements │ ├── __init__.py │ ├── base.py │ ├── errors.py │ ├── layout.py │ ├── layout_elements.py │ └── utils.py │ ├── file_utils.py │ ├── io │ ├── __init__.py │ ├── basic.py │ └── pdf.py │ ├── misc │ └── NotoSerifCJKjp-Regular.otf │ ├── models │ ├── __init__.py │ ├── auto_layoutmodel.py │ ├── base_catalog.py │ ├── base_layoutmodel.py │ ├── detectron2 │ │ ├── __init__.py │ │ ├── catalog.py │ │ └── layoutmodel.py │ ├── effdet │ │ ├── __init__.py │ │ ├── catalog.py │ │ └── 
layoutmodel.py │ ├── model_config.py │ └── paddledetection │ │ ├── __init__.py │ │ ├── catalog.py │ │ └── layoutmodel.py │ ├── ocr │ ├── __init__.py │ ├── base.py │ ├── gcv_agent.py │ └── tesseract_agent.py │ ├── tools │ ├── __init__.py │ └── shape_operations.py │ └── visualization.py ├── tests ├── fixtures │ ├── io │ │ ├── empty.pdf │ │ ├── example.pdf │ │ ├── generate_test_jsons.py │ │ ├── interval.json │ │ ├── interval_textblock.json │ │ ├── layout.csv │ │ ├── layout.json │ │ ├── layout_textblock.csv │ │ ├── layout_textblock.json │ │ ├── quadrilateral.json │ │ ├── quadrilateral_textblock.json │ │ ├── rectangle.json │ │ └── rectangle_textblock.json │ ├── model │ │ ├── config.yml │ │ ├── layout_detection_reference.jpg │ │ ├── layout_detection_reference.json │ │ └── test_model_image.jpg │ └── ocr │ │ ├── test_gcv_image.jpg │ │ ├── test_gcv_response.json │ │ └── test_tesseract_response.pickle ├── test_elements.py ├── test_io.py ├── test_model.py ├── test_ocr.py ├── test_tools.py └── test_visualization.py └── tests_deps ├── test_file_utils.py ├── test_only_detectron2.py ├── test_only_effdet.py └── test_only_paddledetection.py /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | 2 | # Contributor Covenant Code of Conduct 3 | 4 | ## Our Pledge 5 | 6 | In the interest of fostering an open and welcoming environment, we as 7 | contributors and maintainers pledge to make participation in our project and 8 | our community a harassment-free experience for everyone, regardless of age, body 9 | size, disability, ethnicity, sex characteristics, gender identity and expression, 10 | level of experience, education, socio-economic status, nationality, personal 11 | appearance, race, religion, or sexual identity and orientation. 
12 | 13 | ## Our Standards 14 | 15 | Examples of behavior that contributes to creating a positive environment 16 | include: 17 | 18 | * Using welcoming and inclusive language 19 | * Being respectful of differing viewpoints and experiences 20 | * Gracefully accepting constructive criticism 21 | * Focusing on what is best for the community 22 | * Showing empathy towards other community members 23 | 24 | Examples of unacceptable behavior by participants include: 25 | 26 | * The use of sexualized language or imagery and unwelcome sexual attention or 27 | advances 28 | * Trolling, insulting/derogatory comments, and personal or political attacks 29 | * Public or private harassment 30 | * Publishing others' private information, such as a physical or electronic 31 | address, without explicit permission 32 | * Other conduct which could reasonably be considered inappropriate in a 33 | professional setting 34 | 35 | ## Our Responsibilities 36 | 37 | Project maintainers are responsible for clarifying the standards of acceptable 38 | behavior and are expected to take appropriate and fair corrective action in 39 | response to any instances of unacceptable behavior. 40 | 41 | Project maintainers have the right and responsibility to remove, edit, or 42 | reject comments, commits, code, wiki edits, issues, and other contributions 43 | that are not aligned to this Code of Conduct, or to ban temporarily or 44 | permanently any contributor for other behaviors that they deem inappropriate, 45 | threatening, offensive, or harmful. 46 | 47 | ## Scope 48 | 49 | This Code of Conduct applies within all project spaces, and it also applies when 50 | an individual is representing the project or its community in public spaces. 51 | Examples of representing a project or community include using an official 52 | project e-mail address, posting via an official social media account, or acting 53 | as an appointed representative at an online or offline event. 
Representation of 54 | a project may be further defined and clarified by project maintainers. 55 | 56 | ## Enforcement 57 | 58 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 59 | reported by contacting the project team at layoutparser@gmail.com. All 60 | complaints will be reviewed and investigated and will result in a response that 61 | is deemed necessary and appropriate to the circumstances. The project team is 62 | obligated to maintain confidentiality with regard to the reporter of an incident. 63 | Further details of specific enforcement policies may be posted separately. 64 | 65 | Project maintainers who do not follow or enforce the Code of Conduct in good 66 | faith may face temporary or permanent repercussions as determined by other 67 | members of the project's leadership. 68 | 69 | ## Attribution 70 | 71 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 72 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 73 | 74 | [homepage]: https://www.contributor-covenant.org 75 | 76 | For answers to common questions about this code of conduct, see 77 | https://www.contributor-covenant.org/faq 78 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Layout Parser 2 | 3 | 🙌 Thank you for reading this and plan to contribute! We hope you can join us and work on this exciting project that can transform document image analysis pipelines with the full power of Deep Learning. 
4 | 5 | All kinds of contributions are welcome, including but not limited to: 6 | 7 | - Better documentation and examples for more use cases 8 | - New pre-trained layout detection models 9 | - New features 10 | 11 | ## Planned features 12 | 13 | We are planning to improve different aspects of Layout Parser, any feedbacks and contributions would be great! 14 | 15 | ### Layout Modeling 16 | 17 | (Pre-trained) layout models are one of the most important components in Layout Parser, and we are planning to broaden the support for layout models: 18 | 19 | - Support framework other than Detectron2, e.g., [MMOCR](https://github.com/open-mmlab/mmocr). It may lead to easier installation and support for more application scenarios like receipt or invoice detection. 20 | - Support segmentation-based models, e.g., [dhSegment](https://github.com/dhlab-epfl/dhSegment) 21 | - Better customized training of layout detection models, see [layout-model-training](https://github.com/Layout-Parser/layout-model-training) 22 | - Reproducing novel layout models in the current framework, e.g., [CascadeTabNet](https://github.com/DevashishPrasad/CascadeTabNet) 23 | 24 | We are also working on the Layout Parser platform that can support users' sharing their own models. Please check [community-platform](https://github.com/Layout-Parser/community-platform) for more detail. 
25 | 26 | ### Advanced Layout Pipeline 27 | 28 | - Support defining `Pipeline` that specifies an end-to-end layout processing pipeline for complex documents 29 | 30 | ### Command Line Tool and Layout Detection Service 31 | 32 | Layout Parser can be easily turned into a command line tool or service to process documents in bulk 33 | 34 | - Build a command line tool based on `Click` that supports commands like `layoutparser process --path ` 35 | - Build a RESTful Layout Parser service based on tools like `FastAPI` with similar supports as the command line tool 36 | - Performance improvements for such services 37 | 38 | ### Easy Installation and Deployment 39 | 40 | - Better ways for installing Detectron2 and related components on Windows machines 41 | - A Docker configuration for installing the Layout Parser 42 | 43 | ## How to Contribute? 44 | 45 | This how-to-guide is abridged from the [MMOCR Repository](https://github.com/open-mmlab/mmocr/blob/main/.github/CONTRIBUTING.md). 46 | 47 | ### Main Steps 48 | 49 | 1. Fork and pull the latest Layout Parser Repository 50 | 2. Checkout a new branch (do not use main branch for PRs) 51 | 3. Commit your changes 52 | 4. Create a PR 53 | 54 | **Notes**: 55 | 1. If you plan to add some new features that involve big changes, please open an issue to discuss with us first 56 | 2. If you are the author of some papers and would like to include your method into Layout Parser, please let us know (open an issue or contact the maintainers). Your contribution would be much appreciated. 57 | 3. For new features and new modules, unit tests are required to improve the code robustness 58 | 4. You might want to run `pip install -r dev-requirements.txt` to install the dev-dependencies. 59 | 60 | ### Code Style 61 | 62 | 1. We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style. 63 | 2. We use the following tools for linting and formatting: 64 | - pylint: linter 65 | - black: formatter 66 | 3. 
We suggest adding [type hints](https://docs.python.org/3/library/typing.html) for all APIs. 67 | 68 | Sincere thanks, 69 | 70 | Zejiang (Shannon) Shen 71 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: 'bug' 6 | assignees: '' 7 | --- 8 | 9 | **Describe the bug** 10 | A clear and concise description of what the bug is. 11 | 12 | **Checklist** 13 | 14 | 1. I have searched related issues but cannot get the expected help. 15 | 2. The bug has not been fixed in the latest version, see the [Layout Parser Releases](https://github.com/Layout-Parser/layout-parser/releases/) 16 | 17 | **To Reproduce** 18 | Steps to reproduce the behavior: 19 | 1. What command or script did you run? 20 | ```none 21 | A placeholder for the command. 22 | ``` 23 | 24 | **Environment** 25 | 1. Please describe your Platform [Windows/MacOS/Linux] 26 | 2. Please show the Layout Parser version 27 | 2. You may add addition that may be helpful for locating the problem, such as 28 | - How you installed PyTorch [e.g., pip, conda, source] 29 | - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.) 30 | 31 | **Error traceback** 32 | If applicable, paste the error traceback here. 33 | 34 | **Screenshots** 35 | If applicable, add screenshots to help explain your problem. 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 
39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: Installation Guide 4 | url: https://layout-parser.readthedocs.io/en/latest/notes/installation.html 5 | about: | 6 | For any questions related to installation, especially installation on 7 | Windows platforms, please check the Installation Guide first. -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | --- 8 | 9 | **Motivation** 10 | A clear and concise description of the motivation of the feature, and how relates to make Layout Parser better? 11 | You can also find examples in [Layout Parser CONTRIBUTING guidelines](../CONTRIBUTING.md) 12 | 13 | **Related resources** 14 | If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful. 15 | 16 | **Additional context** 17 | Add any other context or screenshots about the feature request here. 
18 | -------------------------------------------------------------------------------- /.github/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Layout-Parser/layout-parser/04e28168d820eea3a1ff1e098078323e7b48648b/.github/example.png -------------------------------------------------------------------------------- /.github/layout-parser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Layout-Parser/layout-parser/04e28168d820eea3a1ff1e098078323e7b48648b/.github/layout-parser.png -------------------------------------------------------------------------------- /.github/lp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Layout-Parser/layout-parser/04e28168d820eea3a1ff1e098078323e7b48648b/.github/lp.png -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'main' 7 | paths: 8 | - '**.py' 9 | pull_request: 10 | 11 | jobs: 12 | 13 | test_only_effdet_backend: 14 | 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v2 18 | - uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.7' 21 | 22 | - name: Test Dependency Support 23 | run: | 24 | pip install pytest 25 | pip install -e . 
# The bare layoutparser module 26 | pytest tests_deps/test_file_utils.py 27 | 28 | - name: Install only effdet deps 29 | run: | 30 | pip install pytest 31 | pip install -e ".[effdet]" 32 | pytest tests_deps/test_only_effdet.py 33 | 34 | test_only_detectron2_backend: 35 | 36 | runs-on: ubuntu-latest 37 | steps: 38 | - uses: actions/checkout@v2 39 | - uses: actions/setup-python@v2 40 | with: 41 | python-version: '3.7' 42 | 43 | - name: Install only Detectron2 deps 44 | run: | 45 | pip install pytest 46 | pip install -e . 47 | pip install torchvision && pip install "git+https://github.com/facebookresearch/detectron2.git@v0.5#egg=detectron2" 48 | pytest tests_deps/test_only_detectron2.py 49 | 50 | test_only_paddledetection_backend: 51 | 52 | runs-on: ubuntu-latest 53 | steps: 54 | - uses: actions/checkout@v2 55 | - uses: actions/setup-python@v2 56 | with: 57 | python-version: '3.7' 58 | 59 | - name: Install only PaddleDetection deps 60 | run: | 61 | pip install pytest 62 | pip install -e ".[paddledetection]" 63 | pytest tests_deps/test_only_paddledetection.py 64 | env: 65 | PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python 66 | 67 | test_all_methods_all_backends: 68 | needs: [test_only_effdet_backend, test_only_detectron2_backend, test_only_paddledetection_backend] 69 | runs-on: ubuntu-latest 70 | strategy: 71 | matrix: 72 | python-version: [3.7, 3.8] 73 | steps: 74 | - uses: actions/checkout@v2 75 | 76 | - name: Set up Python ${{ matrix.python-version }} 77 | uses: actions/setup-python@v2 78 | with: 79 | python-version: ${{ matrix.python-version }} 80 | 81 | - name: Install library and dependencies 82 | run: | 83 | python -m pip install --upgrade pip 84 | pip install . 85 | 86 | - name: Lint with flake8 87 | run: | 88 | pip install flake8 89 | # stop the build if there are Python syntax errors or undefined names 90 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --ignore F821 91 | # exit-zero treats all errors as warnings. 
The GitHub editor is 127 chars wide 92 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 93 | 94 | - name: Test with pytest 95 | run: | 96 | # Install additional requirements when running tests 97 | pip install ".[effdet]" 98 | pip install -r dev-requirements.txt 99 | pytest tests 100 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | release-pypi: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Set up Python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: '3.x' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | - name: Build and publish 21 | env: 22 | TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} 23 | TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} 24 | run: | 25 | python setup.py sdist bdist_wheel 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Examples files 2 | examples/Customizing Layout Models with Label Studio Annotation/downloaded-annotations 3 | 4 | *.bak 5 | .gitattributes 6 | .last_checked 7 | .gitconfig 8 | *.bak 9 | *.log 10 | *~ 11 | ~* 12 | _tmp* 13 | tmp* 14 | tags 15 | 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | *$py.class 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Distribution / packaging 25 | .Python 26 | env/ 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | 43 | # PyInstaller 44 | # 
Usually these files are written by a python script from a template 45 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 46 | *.manifest 47 | *.spec 48 | 49 | # Installer logs 50 | pip-log.txt 51 | pip-delete-this-directory.txt 52 | 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *.cover 62 | .hypothesis/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | local_settings.py 71 | 72 | # Flask stuff: 73 | instance/ 74 | .webassets-cache 75 | 76 | # Scrapy stuff: 77 | .scrapy 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # PyBuilder 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # celery beat schedule file 92 | celerybeat-schedule 93 | 94 | # SageMath parsed files 95 | *.sage.py 96 | 97 | # dotenv 98 | .env 99 | 100 | # virtualenv 101 | .venv 102 | venv/ 103 | ENV/ 104 | 105 | # Spyder project settings 106 | .spyderproject 107 | .spyproject 108 | 109 | # Rope project settings 110 | .ropeproject 111 | 112 | # mkdocs documentation 113 | /site 114 | 115 | # mypy 116 | .mypy_cache/ 117 | 118 | .vscode 119 | *.swp 120 | 121 | # osx generated files 122 | .DS_Store 123 | .DS_Store? 
124 | .Trashes 125 | ehthumbs.db 126 | Thumbs.db 127 | .idea 128 | 129 | # pytest 130 | .pytest_cache 131 | 132 | # tools/trust-doc-nbs 133 | docs_src/.last_checked 134 | 135 | # symlinks to fastai 136 | docs_src/fastai 137 | tools/fastai 138 | 139 | # link checker 140 | checklink/cookies.txt 141 | 142 | # .gitconfig is now autogenerated 143 | .gitconfig 144 | 145 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Optionally build your docs in additional formats such as PDF 13 | formats: all 14 | 15 | # Optionally set the version of Python and requirements required to build your docs 16 | python: 17 | version: 3.7 18 | install: 19 | - method: pip 20 | path: . 21 | extra_requirements: 22 | - effdet 23 | - requirements: dev-requirements.txt -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | include src/layoutparser/misc/*.otf 4 | recursive-exclude * __pycache__ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | Layout Parser Logo 3 |

4 | A unified toolkit for Deep Learning Based Document Image Analysis 5 |

6 |

7 | 8 |

9 | 10 | 11 | PyPI - Downloads 12 |

13 | 14 |

15 | 16 | 17 | 18 |

19 | 20 | --- 21 | 22 | ## What is LayoutParser 23 | 24 | ![Example Usage](https://github.com/Layout-Parser/layout-parser/raw/main/.github/example.png) 25 | 26 | LayoutParser aims to provide a wide range of tools that aims to streamline Document Image Analysis (DIA) tasks. Please check the LayoutParser [demo video](https://youtu.be/8yA5xB4Dg8c) (1 min) or [full talk](https://www.youtube.com/watch?v=YG0qepPgyGY) (15 min) for details. And here are some key features: 27 | 28 | - LayoutParser provides a rich repository of deep learning models for layout detection as well as a set of unified APIs for using them. For example, 29 | 30 |
31 | Perform DL layout detection in 4 lines of code 32 | 33 | ```python 34 | import layoutparser as lp 35 | model = lp.AutoLayoutModel('lp://EfficientDete/PubLayNet') 36 | # image = Image.open("path/to/image") 37 | layout = model.detect(image) 38 | ``` 39 | 40 |
41 | 42 | - LayoutParser comes with a set of layout data structures with carefully designed APIs that are optimized for document image analysis tasks. For example, 43 | 44 |
45 | Selecting layout/textual elements in the left column of a page 46 | 47 | ```python 48 | image_width = image.size[0] 49 | left_column = lp.Interval(0, image_width/2, axis='x') 50 | layout.filter_by(left_column, center=True) # select objects in the left column 51 | ``` 52 | 53 |
54 | 55 |
56 | Performing OCR for each detected Layout Region 57 | 58 | ```python 59 | ocr_agent = lp.TesseractAgent() 60 | for layout_region in layout: 61 | image_segment = layout_region.crop(image) 62 | text = ocr_agent.detect(image_segment) 63 | ``` 64 | 65 |
66 | 67 |
68 | Flexible APIs for visualizing the detected layouts 69 | 70 | ```python 71 | lp.draw_box(image, layout, box_width=1, show_element_id=True, box_alpha=0.25) 72 | ``` 73 | 74 |
75 | 76 | 77 | 78 |
79 | Loading layout data stored in json, csv, and even PDFs 80 | 81 | ```python 82 | layout = lp.load_json("path/to/json") 83 | layout = lp.load_csv("path/to/csv") 84 | pdf_layout = lp.load_pdf("path/to/pdf") 85 | ``` 86 | 87 |
88 | 89 | - LayoutParser is also a open platform that enables the sharing of layout detection models and DIA pipelines among the community. 90 |
91 | Check the LayoutParser open platform 92 |
93 | 94 |
95 | Submit your models/pipelines to LayoutParser 96 |
97 | 98 | ## Installation 99 | 100 | After several major updates, layoutparser provides various functionalities and deep learning models from different backends. But it still easy to install layoutparser, and we designed the installation method in a way such that you can choose to install only the needed dependencies for your project: 101 | 102 | ```bash 103 | pip install layoutparser # Install the base layoutparser library with 104 | pip install "layoutparser[layoutmodels]" # Install DL layout model toolkit 105 | pip install "layoutparser[ocr]" # Install OCR toolkit 106 | ``` 107 | 108 | Extra steps are needed if you want to use Detectron2-based models. Please check [installation.md](installation.md) for additional details on layoutparser installation. 109 | 110 | ## Examples 111 | 112 | We provide a series of examples for to help you start using the layout parser library: 113 | 114 | 1. [Table OCR and Results Parsing](https://github.com/Layout-Parser/layout-parser/blob/main/examples/OCR%20Tables%20and%20Parse%20the%20Output.ipynb): `layoutparser` can be used for conveniently OCR documents and convert the output in to structured data. 115 | 116 | 2. [Deep Layout Parsing Example](https://github.com/Layout-Parser/layout-parser/blob/main/examples/Deep%20Layout%20Parsing.ipynb): With the help of Deep Learning, `layoutparser` supports the analysis very complex documents and processing of the hierarchical structure in the layouts. 117 | 118 | ## Contributing 119 | 120 | We encourage you to contribute to Layout Parser! Please check out the [Contributing guidelines](.github/CONTRIBUTING.md) for guidelines about how to proceed. Join us! 121 | 122 | ## Citing `layoutparser` 123 | 124 | If you find `layoutparser` helpful to your work, please consider citing our tool and [paper](https://arxiv.org/pdf/2103.15348.pdf) using the following BibTeX entry. 
125 | 126 | ``` 127 | @article{shen2021layoutparser, 128 | title={LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis}, 129 | author={Shen, Zejiang and Zhang, Ruochen and Dell, Melissa and Lee, Benjamin Charles Germain and Carlson, Jacob and Li, Weining}, 130 | journal={arXiv preprint arXiv:2103.15348}, 131 | year={2021} 132 | } 133 | ``` -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | torch 3 | numpy 4 | opencv-python 5 | pandas 6 | docutils==0.16 7 | Sphinx==3.0.0 8 | recommonmark==0.6.0 9 | sphinx-markdown-tables 10 | sphinx_rtd_theme 11 | google-cloud-vision==1 12 | pytesseract 13 | pycocotools 14 | git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2 15 | paddlepaddle 16 | effdet -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/api_doc/elements.rst: -------------------------------------------------------------------------------- 1 | Layout Elements 2 | ================================ 3 | 4 | 5 | Coordinate System 6 | -------------------------------- 7 | 8 | .. autoclass:: layoutparser.elements.Interval 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | .. autoclass:: layoutparser.elements.Rectangle 14 | :members: 15 | :undoc-members: 16 | :show-inheritance: 17 | 18 | .. autoclass:: layoutparser.elements.Quadrilateral 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | TextBlock 25 | -------------------------------- 26 | 27 | .. autoclass:: layoutparser.elements.TextBlock 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | 32 | Layout 33 | -------------------------------- 34 | 35 | .. autoclass:: layoutparser.elements.Layout 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: -------------------------------------------------------------------------------- /docs/api_doc/io.rst: -------------------------------------------------------------------------------- 1 | Load and Export Layout Data 2 | ================================ 3 | 4 | 5 | `Dataframe` and CSV 6 | -------------------------------- 7 | 8 | .. autofunction:: layoutparser.io.load_dataframe 9 | 10 | .. autofunction:: layoutparser.io.load_csv 11 | 12 | 13 | `Dict` and JSON 14 | -------------------------------- 15 | 16 | .. autofunction:: layoutparser.io.load_dict 17 | 18 | .. autofunction:: layoutparser.io.load_json 19 | 20 | 21 | PDF 22 | -------------------------------- 23 | 24 | .. autofunction:: layoutparser.io.load_pdf 25 | 26 | 27 | Other Formats 28 | -------------------------------- 29 | Stay tuned! We are working on to support more formats. 
-------------------------------------------------------------------------------- /docs/api_doc/models.rst: -------------------------------------------------------------------------------- 1 | Layout Detection Models 2 | ================================ 3 | 4 | 5 | .. autoclass:: layoutparser.models.Detectron2LayoutModel 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: -------------------------------------------------------------------------------- /docs/api_doc/ocr.rst: -------------------------------------------------------------------------------- 1 | Text Recognition Tool 2 | ================================ 3 | 4 | 5 | Google Cloud Vision API 6 | -------------------------------- 7 | 8 | .. autoclass:: layoutparser.ocr.GCVFeatureType 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | .. autoclass:: layoutparser.ocr.GCVAgent 14 | :members: 15 | :undoc-members: 16 | :show-inheritance: 17 | 18 | 19 | Tesseract OCR API 20 | -------------------------------- 21 | 22 | .. autoclass:: layoutparser.ocr.TesseractFeatureType 23 | :members: 24 | :undoc-members: 25 | :show-inheritance: 26 | 27 | .. autoclass:: layoutparser.ocr.TesseractAgent 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: -------------------------------------------------------------------------------- /docs/api_doc/visualization.rst: -------------------------------------------------------------------------------- 1 | Layout and Text Visualization 2 | ================================ 3 | 4 | .. automodule:: layoutparser.visualization 5 | :members: 6 | :undoc-members: -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Layout Parser team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Configuration file for the Sphinx documentation builder. 16 | # 17 | # This file only contains a selection of the most common options. For a full 18 | # list see the documentation: 19 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 20 | 21 | # -- Path setup -------------------------------------------------------------- 22 | 23 | # If extensions (or modules to document with autodoc) are in another directory, 24 | # add these directories to sys.path here. If the directory is relative to the 25 | # documentation root, use os.path.abspath to make it absolute, like shown here. 26 | # 27 | import os 28 | import sys 29 | sys.path.insert(0, os.path.abspath('../src')) 30 | import layoutparser 31 | 32 | # -- Project information ----------------------------------------------------- 33 | 34 | project = 'Layout Parser' 35 | copyright = '2020-2021, Layout Parser Contributors' 36 | author = 'Layout Parser Contributors' 37 | 38 | # The full version, including alpha/beta/rc tags 39 | release = layoutparser.__version__ 40 | 41 | 42 | # -- General configuration --------------------------------------------------- 43 | 44 | # Add any Sphinx extension module names here, as strings. They can be 45 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 46 | # ones. 
47 | extensions = [ 48 | "recommonmark", 49 | "sphinx.ext.autodoc", 50 | "sphinx.ext.napoleon", 51 | "sphinx.ext.intersphinx", 52 | "sphinx.ext.todo", 53 | "sphinx.ext.coverage", 54 | "sphinx.ext.mathjax", 55 | "sphinx.ext.viewcode", 56 | "sphinx.ext.githubpages", 57 | "sphinx_markdown_tables" 58 | ] 59 | 60 | # Add any paths that contain templates here, relative to this directory. 61 | templates_path = ['_templates'] 62 | source_suffix = [".rst", ".md"] 63 | 64 | # List of patterns, relative to source directory, that match files and 65 | # directories to ignore when looking for source files. 66 | # This pattern also affects html_static_path and html_extra_path. 67 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 68 | 69 | 70 | # -- Options for HTML output ------------------------------------------------- 71 | 72 | # The theme to use for HTML and HTML Help pages. See the documentation for 73 | # a list of builtin themes. 74 | # 75 | html_theme = 'sphinx_rtd_theme' 76 | 77 | # Add any paths that contain custom static files (such as style sheets) here, 78 | # relative to this directory. They are copied after the builtin static files, 79 | # so a file named "default.css" will overwrite the builtin "default.css". 80 | html_static_path = ['_static'] 81 | 82 | 83 | # Additional Configurations 84 | intersphinx_mapping = {'python': ('https://docs.python.org/3', None)} 85 | autodoc_member_order = 'bysource' 86 | autoclass_content = 'both' 87 | 88 | # [TODO] Solve the issue for functools.wrappers cause **kwargs in function declarations -------------------------------------------------------------------------------- /docs/example/deep_layout_parsing/index.rst: -------------------------------------------------------------------------------- 1 | Deep Layout Parsing 2 | =================== 3 | 4 | In this tutorial, we will show how to use the ``layoutparser`` API to 5 | 6 | 1. 
Load Deep Learning Layout Detection models and predict the layout of 7 | the paper image 8 | 2. Use the coordinate system to parse the output 9 | 10 | The ``paper-image`` is from https://arxiv.org/abs/2004.08686. 11 | 12 | .. code:: python 13 | 14 | import layoutparser as lp 15 | import cv2 16 | 17 | Use Layout Models to detect complex layout 18 | ------------------------------------------ 19 | 20 | ``layoutparser`` can identify the layout of the given document with only 21 | 4 lines of code. 22 | 23 | .. code:: python 24 | 25 | image = cv2.imread("data/paper-image.jpg") 26 | image = image[..., ::-1] 27 | # Convert the image from BGR (cv2 default loading style) 28 | # to RGB 29 | 30 | .. code:: python 31 | 32 | model = lp.Detectron2LayoutModel('lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config', 33 | extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8], 34 | label_map={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}) 35 | # Load the deep layout model from the layoutparser API 36 | # For all the supported model, please check the Model 37 | # Zoo Page: https://layout-parser.readthedocs.io/en/latest/notes/modelzoo.html 38 | 39 | .. code:: python 40 | 41 | layout = model.detect(image) 42 | # Detect the layout of the input image 43 | 44 | .. code:: python 45 | 46 | lp.draw_box(image, layout, box_width=3) 47 | # Show the detected layout of the input image 48 | 49 | 50 | 51 | 52 | .. image:: output_7_0.png 53 | 54 | 55 | 56 | Check the results from the model 57 | -------------------------------- 58 | 59 | .. code:: python 60 | 61 | type(layout) 62 | 63 | 64 | 65 | 66 | .. parsed-literal:: 67 | 68 | layoutparser.elements.Layout 69 | 70 | 71 | 72 | The ``layout`` variables is a ``Layout`` instance, which is inherited 73 | from list and supports handy methods for layout processing. 74 | 75 | .. code:: python 76 | 77 | layout[0] 78 | 79 | 80 | 81 | 82 | .. 
parsed-literal:: 83 | 84 | TextBlock(block=Rectangle(x_1=646.4182739257812, y_1=1420.1715087890625, x_2=1132.8687744140625, y_2=1479.7222900390625), text=, id=None, type=Text, parent=None, next=None, score=0.9996440410614014) 85 | 86 | 87 | 88 | ``layout`` contains a series of ``TextBlock``\ s. They store the 89 | coordinates in the ``.block`` variable and other information of the 90 | blocks like block type in ``.type``, text in ``.text``, etc. More 91 | information can be found at the 92 | `documentation `__. 93 | 94 | Use the coordinate system to process the detected layout 95 | -------------------------------------------------------- 96 | 97 | Firstly we filter text region of specific type: 98 | 99 | .. code:: python 100 | 101 | text_blocks = lp.Layout([b for b in layout if b.type=='Text']) 102 | figure_blocks = lp.Layout([b for b in layout if b.type=='Figure']) 103 | 104 | As there could be text region detected inside the figure region, we just 105 | drop them: 106 | 107 | .. code:: python 108 | 109 | text_blocks = lp.Layout([b for b in text_blocks \ 110 | if not any(b.is_in(b_fig) for b_fig in figure_blocks)]) 111 | 112 | Finally sort the text regions and assign ids: 113 | 114 | .. code:: python 115 | 116 | h, w = image.shape[:2] 117 | 118 | left_interval = lp.Interval(0, w/2*1.05, axis='x').put_on_canvas(image) 119 | 120 | left_blocks = text_blocks.filter_by(left_interval, center=True) 121 | left_blocks.sort(key = lambda b:b.coordinates[1], inplace=True) 122 | 123 | right_blocks = [b for b in text_blocks if b not in left_blocks] 124 | right_blocks.sort(key = lambda b:b.coordinates[1], inplace=True) 125 | 126 | # And finally combine the two list and add the index 127 | # according to the order 128 | text_blocks = lp.Layout([b.set(id = idx) for idx, b in enumerate(left_blocks + right_blocks)]) 129 | 130 | Visualize the cleaned text blocks: 131 | 132 | .. 
code:: python 133 | 134 | lp.draw_box(image, text_blocks, 135 | box_width=3, 136 | show_element_id=True) 137 | 138 | 139 | 140 | 141 | .. image:: output_21_0.png 142 | 143 | 144 | 145 | Fetch the text inside each text region 146 | --------------------------------------- 147 | 148 | We can also combine with the OCR functionality in ``layoutparser`` to 149 | fetch the text in the document. 150 | 151 | .. code:: python 152 | 153 | ocr_agent = lp.TesseractAgent(languages='eng') 154 | # Initialize the tesseract ocr engine. You might need 155 | # to install the OCR components in layoutparser: 156 | # pip install layoutparser[ocr] 157 | 158 | .. code:: python 159 | 160 | for block in text_blocks: 161 | segment_image = (block 162 | .pad(left=5, right=5, top=5, bottom=5) 163 | .crop_image(image)) 164 | # add padding in each image segment can help 165 | # improve robustness 166 | 167 | text = ocr_agent.detect(segment_image) 168 | block.set(text=text, inplace=True) 169 | 170 | .. code:: python 171 | 172 | for txt in text_blocks.get_texts(): 173 | print(txt, end='\n---\n') 174 | 175 | 176 | .. parsed-literal:: 177 | 178 | Figure 7: Annotation Examples in HJDataset. (a) and (b) show two examples for the labeling of main pages. The boxes 179 | are colored differently to reflect the layout element categories. Illustrated in (c), the items in each index page row are 180 | categorized as title blocks, and the annotations are denser. 181 | --- 182 | tion over union (IOU) level [0.50:0.95]’, on the test data. In 183 | general, the high mAP values indicate accurate detection of 184 | the layout elements. The Faster R-CNN and Mask R-CNN 185 | achieve comparable results, better than RetinaNet. Notice- 186 | ably, the detections for small blocks like title are less pre- 187 | cise, and the accuracy drops sharply for the title category. In 188 | Figure 8, (a) and (b) illustrate the accurate prediction results 189 | of the Faster R-CNN model. 
190 | --- 191 | We also examine how our dataset can help with 192 | world document digitization application. When digitizing 193 | new publications, researchers usually do not generate large 194 | scale ground truth data to train their layout analysis models. 195 | If they are able to adapt our dataset, or models trained on 196 | our dataset, to develop models on their data, they can build 197 | their pipelines more efficiently and develop more accurate 198 | models. To this end, we conduct two experiments. First we 199 | examine how layout analysis models trained on the main 200 | pages can be used for understanding index pages. More- 201 | over, we study how the pre-trained models perform on other 202 | historical Japanese documents. 203 | --- 204 | Table 4 compares the performance of five Faster R-CNN 205 | models that are trained differently on index pages. If the 206 | model loads pre-trained weights from HJDataset, it includes 207 | information learned from main pages. Models trained over 208 | --- 209 | ?This is a core metric developed for the COCO competition [| 2] for 210 | evaluating the object detection quality. 211 | --- 212 | all the training data can be viewed as the benchmarks, while 213 | training with few samples (five in this case) are consid- 214 | ered to mimic real-world scenarios. Given different train- 215 | ing data, models pre-trained on HJDataset perform signifi- 216 | cantly better than those initialized with COCO weights. In- 217 | tuitively, models trained on more data perform better than 218 | those with fewer samples. We also directly use the model 219 | trained on main to predict index pages without fine- 220 | tuning. The low zero-shot prediction accuracy indicates the 221 | dissimilarity between index and main pages. The large 222 | increase in mAP from 0.344 to 0.471 after the model is 223 | --- 224 | Table 3: Detection mAP @ IOU [0.50:0.95] of different 225 | models for each category on the test set. 
All values are given 226 | as percentages. 227 | --- 228 | * For training Mask R-CNN, the segmentation masks are the quadri- 229 | lateral regions for each block. Compared to the rectangular bounding 230 | boxes, they delineate the text region more accurately. 231 | --- 232 | 233 | -------------------------------------------------------------------------------- /docs/example/deep_layout_parsing/output_21_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Layout-Parser/layout-parser/04e28168d820eea3a1ff1e098078323e7b48648b/docs/example/deep_layout_parsing/output_21_0.png -------------------------------------------------------------------------------- /docs/example/deep_layout_parsing/output_7_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Layout-Parser/layout-parser/04e28168d820eea3a1ff1e098078323e7b48648b/docs/example/deep_layout_parsing/output_7_0.png -------------------------------------------------------------------------------- /docs/example/load_coco/index.rst: -------------------------------------------------------------------------------- 1 | Load COCO Layout Annotations 2 | ============================================================== 3 | 4 | Preparation 5 | ----------- 6 | 7 | In this notebook, I will illustrate how to use LayoutParser to load and 8 | visualize the layout annotation in the COCO format. 9 | 10 | Before starting, please remember to download PubLayNet annotations and 11 | images from their 12 | `website `__ 13 | (let’s just use the validation set for now as the training set is very 14 | large). And let’s put all extracted files in the 15 | ``data/publaynet/annotations`` and ``data/publaynet/val`` folder. 16 | 17 | And we need to install an additional library for conveniently handling 18 | the COCO data format: 19 | 20 | .. 
code:: bash 21 | 22 | pip install pycocotools 23 | 24 | OK - Let’s get on the code: 25 | 26 | Loading and visualizing layouts using Layout-Parser 27 | --------------------------------------------------- 28 | 29 | .. code:: python 30 | 31 | from pycocotools.coco import COCO 32 | import layoutparser as lp 33 | import random 34 | import cv2 35 | 36 | .. code:: python 37 | 38 | def load_coco_annotations(annotations, coco=None): 39 | """ 40 | Args: 41 | annotations (List): 42 | a list of coco annotations for the current image 43 | coco (`optional`, defaults to `None`): 44 | COCO annotation object instance. If set, this function will 45 | convert the loaded annotation category ids to category names 46 | set in COCO.categories 47 | """ 48 | layout = lp.Layout() 49 | 50 | for ele in annotations: 51 | 52 | x, y, w, h = ele['bbox'] 53 | 54 | layout.append( 55 | lp.TextBlock( 56 | block = lp.Rectangle(x, y, w+x, h+y), 57 | type = ele['category_id'] if coco is None else coco.cats[ele['category_id']]['name'], 58 | id = ele['id'] 59 | ) 60 | ) 61 | 62 | return layout 63 | 64 | The ``load_coco_annotations`` function will help convert COCO 65 | annotations into the layoutparser objects. 66 | 67 | .. code:: python 68 | 69 | COCO_ANNO_PATH = 'data/publaynet/annotations/val.json' 70 | COCO_IMG_PATH = 'data/publaynet/val' 71 | 72 | coco = COCO(COCO_ANNO_PATH) 73 | 74 | 75 | .. parsed-literal:: 76 | 77 | loading annotations into memory... 78 | Done (t=1.17s) 79 | creating index... 80 | index created! 81 | 82 | 83 | ..
code:: python 84 | 85 | color_map = { 86 | 'text': 'red', 87 | 'title': 'blue', 88 | 'list': 'green', 89 | 'table': 'purple', 90 | 'figure': 'pink', 91 | } 92 | 93 | 94 | for image_id in random.sample(coco.imgs.keys(), 1): 95 | image_info = coco.imgs[image_id] 96 | annotations = coco.loadAnns(coco.getAnnIds([image_id])) 97 | 98 | image = cv2.imread(f'{COCO_IMG_PATH}/{image_info["file_name"]}') 99 | layout = load_coco_annotations(annotations, coco) 100 | 101 | viz = lp.draw_box(image, layout, color_map=color_map) 102 | display(viz) # show the results 103 | 104 | 105 | 106 | .. image:: output_8_0.png 107 | 108 | 109 | You could add more information in the visualization. 110 | 111 | .. code:: python 112 | 113 | lp.draw_box(image, 114 | [b.set(id=f'{b.id}/{b.type}') for b in layout], 115 | color_map=color_map, 116 | show_element_id=True, id_font_size=10, 117 | id_text_background_color='grey', 118 | id_text_color='white') 119 | 120 | 121 | 122 | 123 | .. image:: output_10_0.png 124 | 125 | 126 | 127 | Model Predictions on loaded data 128 | -------------------------------- 129 | 130 | We could also check how the trained layout model performs on the input 131 | image. Following this 132 | `instruction `__, 133 | we could conveniently load a layout prediction model and run predictions 134 | on the existing image. 135 | 136 | .. code:: python 137 | 138 | model = lp.Detectron2LayoutModel('lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config', 139 | extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8], 140 | label_map={0: "text", 1: "title", 2: "list", 3:"table", 4:"figure"}) 141 | 142 | .. code:: python 143 | 144 | layout_predicted = model.detect(image) 145 | 146 | .. code:: python 147 | 148 | lp.draw_box(image, 149 | [b.set(id=f'{b.type}/{b.score:.2f}') for b in layout_predicted], 150 | color_map=color_map, 151 | show_element_id=True, id_font_size=10, 152 | id_text_background_color='grey', 153 | id_text_color='white') 154 | 155 | 156 | 157 | 158 | .. 
image:: output_15_0.png 159 | 160 | 161 | -------------------------------------------------------------------------------- /docs/example/load_coco/output_10_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Layout-Parser/layout-parser/04e28168d820eea3a1ff1e098078323e7b48648b/docs/example/load_coco/output_10_0.png -------------------------------------------------------------------------------- /docs/example/load_coco/output_15_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Layout-Parser/layout-parser/04e28168d820eea3a1ff1e098078323e7b48648b/docs/example/load_coco/output_15_0.png -------------------------------------------------------------------------------- /docs/example/load_coco/output_8_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Layout-Parser/layout-parser/04e28168d820eea3a1ff1e098078323e7b48648b/docs/example/load_coco/output_8_0.png -------------------------------------------------------------------------------- /docs/example/parse_ocr/output_14_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Layout-Parser/layout-parser/04e28168d820eea3a1ff1e098078323e7b48648b/docs/example/parse_ocr/output_14_0.png -------------------------------------------------------------------------------- /docs/example/parse_ocr/output_17_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Layout-Parser/layout-parser/04e28168d820eea3a1ff1e098078323e7b48648b/docs/example/parse_ocr/output_17_0.png -------------------------------------------------------------------------------- /docs/example/parse_ocr/output_19_0.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Layout-Parser/layout-parser/04e28168d820eea3a1ff1e098078323e7b48648b/docs/example/parse_ocr/output_19_0.png -------------------------------------------------------------------------------- /docs/example/parse_ocr/output_25_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Layout-Parser/layout-parser/04e28168d820eea3a1ff1e098078323e7b48648b/docs/example/parse_ocr/output_25_0.png -------------------------------------------------------------------------------- /docs/example/parse_ocr/output_6_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Layout-Parser/layout-parser/04e28168d820eea3a1ff1e098078323e7b48648b/docs/example/parse_ocr/output_6_1.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Layout Parser documentation master file, created by 2 | sphinx-quickstart on Sun Jun 14 23:23:41 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Layout Parser's documentation! 7 | ================================================================ 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Notes 12 | 13 | notes/installation.md 14 | notes/modelzoo.md 15 | 16 | .. toctree:: 17 | :maxdepth: 2 18 | :caption: Examples 19 | 20 | example/parse_ocr/index 21 | example/deep_layout_parsing/index 22 | example/load_coco/index 23 | 24 | .. 
toctree:: 25 | :maxdepth: 2 26 | :caption: API Reference 27 | 28 | api_doc/elements 29 | notes/shape_operations.md 30 | api_doc/ocr 31 | api_doc/models 32 | api_doc/visualization 33 | api_doc/io 34 | 35 | Indices and tables 36 | ================== 37 | 38 | * :ref:`genindex` 39 | * :ref:`search` 40 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/notes/installation.md: -------------------------------------------------------------------------------- 1 | ../../installation.md -------------------------------------------------------------------------------- /docs/notes/intersection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Layout-Parser/layout-parser/04e28168d820eea3a1ff1e098078323e7b48648b/docs/notes/intersection.png -------------------------------------------------------------------------------- /docs/notes/modelzoo.md: -------------------------------------------------------------------------------- 1 | # Model Zoo 2 | 3 | We provide a spectrum of pre-trained models on different datasets. 
4 | 5 | ## Example Usage: 6 | 7 | ```python 8 | import layoutparser as lp 9 | model = lp.Detectron2LayoutModel( 10 | config_path ='lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config', # In model catalog 11 | label_map ={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}, # In model`label_map` 12 | extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8] # Optional 13 | ) 14 | model.detect(image) 15 | ``` 16 | 17 | ## Model Catalog 18 | 19 | | Dataset | Model | Config Path | Eval Result (mAP) | 20 | |-----------------------------------------------------------------------|--------------------------------------------------------------------------------------------|--------------------------------------------------------|---------------------------------------------------------------------------| 21 | | [HJDataset](https://dell-research-harvard.github.io/HJDataset/) | [faster_rcnn_R_50_FPN_3x](https://www.dropbox.com/s/j4yseny2u0hn22r/config.yml?dl=1) | lp://HJDataset/faster_rcnn_R_50_FPN_3x/config | | 22 | | [HJDataset](https://dell-research-harvard.github.io/HJDataset/) | [mask_rcnn_R_50_FPN_3x](https://www.dropbox.com/s/4jmr3xanmxmjcf8/config.yml?dl=1) | lp://HJDataset/mask_rcnn_R_50_FPN_3x/config | | 23 | | [HJDataset](https://dell-research-harvard.github.io/HJDataset/) | [retinanet_R_50_FPN_3x](https://www.dropbox.com/s/z8a8ywozuyc5c2x/config.yml?dl=1) | lp://HJDataset/retinanet_R_50_FPN_3x/config | | 24 | | [PubLayNet](https://github.com/ibm-aur-nlp/PubLayNet) | [faster_rcnn_R_50_FPN_3x](https://www.dropbox.com/s/f3b12qc4hc0yh4m/config.yml?dl=1) | lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config | | 25 | | [PubLayNet](https://github.com/ibm-aur-nlp/PubLayNet) | [mask_rcnn_R_50_FPN_3x](https://www.dropbox.com/s/u9wbsfwz4y0ziki/config.yml?dl=1) | lp://PubLayNet/mask_rcnn_R_50_FPN_3x/config | | 26 | | [PubLayNet](https://github.com/ibm-aur-nlp/PubLayNet) | [mask_rcnn_X_101_32x8d_FPN_3x](https://www.dropbox.com/s/nau5ut6zgthunil/config.yaml?dl=1) | 
lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config | 88.98 [eval.csv](https://www.dropbox.com/s/15ytg3fzmc6l59x/eval.csv?dl=0) | 27 | | [PrimaLayout](https://www.primaresearch.org/dataset/) | [mask_rcnn_R_50_FPN_3x](https://www.dropbox.com/s/yc92x97k50abynt/config.yaml?dl=1) | lp://PrimaLayout/mask_rcnn_R_50_FPN_3x/config | 69.35 [eval.csv](https://www.dropbox.com/s/9uuql57uedvb9mo/eval.csv?dl=0) | 28 | | [NewspaperNavigator](https://news-navigator.labs.loc.gov/) | [faster_rcnn_R_50_FPN_3x](https://www.dropbox.com/s/wnido8pk4oubyzr/config.yml?dl=1) | lp://NewspaperNavigator/faster_rcnn_R_50_FPN_3x/config | | 29 | | [TableBank](https://doc-analysis.github.io/tablebank-page/index.html) | [faster_rcnn_R_50_FPN_3x](https://www.dropbox.com/s/7cqle02do7ah7k4/config.yaml?dl=1) | lp://TableBank/faster_rcnn_R_50_FPN_3x/config | 89.78 [eval.csv](https://www.dropbox.com/s/1uwnz58hxf96iw2/eval.csv?dl=0) | 30 | | [TableBank](https://doc-analysis.github.io/tablebank-page/index.html) | [faster_rcnn_R_101_FPN_3x](https://www.dropbox.com/s/h63n6nv51kfl923/config.yaml?dl=1) | lp://TableBank/faster_rcnn_R_101_FPN_3x/config | 91.26 [eval.csv](https://www.dropbox.com/s/e1kq8thkj2id1li/eval.csv?dl=0) | 31 | | [Math Formula Detection(MFD)](http://transcriptorium.eu/~htrcontest/MathsICDAR2021/) | [faster_rcnn_R_50_FPN_3x](https://www.dropbox.com/s/ld9izb95f19369w/config.yaml?dl=1) | lp://MFD/faster_rcnn_R_50_FPN_3x/config | 79.68 [eval.csv](https://www.dropbox.com/s/1yvrs29jjybrlpw/eval.csv?dl=0) | 32 | 33 | 34 | * For PubLayNet models, we suggest using `mask_rcnn_X_101_32x8d_FPN_3x` model as it's trained on the whole training set, while others are only trained on the validation set (the size is only around 1/50). You could expect a 15% AP improvement using the `mask_rcnn_X_101_32x8d_FPN_3x` model. 
35 | 36 | ## Model `label_map` 37 | 38 | | Dataset | Label Map | 39 | | ------------------------------------------------------------ | ------------------------------------------------------------ | 40 | | [HJDataset](https://dell-research-harvard.github.io/HJDataset/) | `{1:"Page Frame", 2:"Row", 3:"Title Region", 4:"Text Region", 5:"Title", 6:"Subtitle", 7:"Other"}` | 41 | | [PubLayNet](https://github.com/ibm-aur-nlp/PubLayNet) | `{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}` | 42 | | [PrimaLayout](https://www.primaresearch.org/dataset/) | `{1:"TextRegion", 2:"ImageRegion", 3:"TableRegion", 4:"MathsRegion", 5:"SeparatorRegion", 6:"OtherRegion"}` | 43 | | [NewspaperNavigator](https://news-navigator.labs.loc.gov/) | `{0: "Photograph", 1: "Illustration", 2: "Map", 3: "Comics/Cartoon", 4: "Editorial Cartoon", 5: "Headline", 6: "Advertisement"}` | 44 | | [TableBank](https://doc-analysis.github.io/tablebank-page/index.html) | `{0: "Table"}` | 45 | | [MFD](http://transcriptorium.eu/~htrcontest/MathsICDAR2021/) | `{1: "Equation"}` | -------------------------------------------------------------------------------- /docs/notes/quickstart.rst: -------------------------------------------------------------------------------- 1 | Quickstart 2 | ================================ 3 | 4 | 5 | Installation 6 | -------------------------------- 7 | 8 | Use pip or conda to install the library: 9 | 10 | .. code-block:: bash 11 | 12 | pip install layoutparser 13 | 14 | # Install Detectron2 for using DL Layout Detection Model 15 | pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.1.3#egg=detectron2' 16 | 17 | # Install the ocr components when necessary 18 | pip install layoutparser[ocr] 19 | 20 | This by default will install the CPU version of the Detectron2, and it should be able to run on most of the computers. But if you have a GPU, you can consider the GPU version of the Detectron2, referring to the `official instructions `_. 
-------------------------------------------------------------------------------- /docs/notes/shape_operations.md: -------------------------------------------------------------------------------- 1 | # Shape Operations 2 | 3 | [BETA: the API and behavior *will* be changed in the future.] 4 | 5 | Starting from v0.2, Layout Parser provides support for two types of shape operations, `union` and `intersection`, across all `BaseCoordElement`s and `TextBlock`. We've made some design choices to construct a set of generalized APIs across different shape classes, detailed as follows: 6 | 7 | ## The `union` Operation 8 | 9 | ![Illustration of Union Operations](union.png) 10 | ▲ The Illustration of Union Operations. The resulting matrix is symmetric so only the lower triangular region is left empty. Each cell shows the visualization of the shape objects, their coordinates, and their object class. For the output visualization, the gray and dashed line delineates the original obj1 and obj2, respectively, for reference. 11 | 12 | **Notes**: 13 | 1. The x-interval and y-interval are both from the `Interval` Class but with different axes. It's ill-defined to union two intervals from different axes so in this case Layout Parser will raise an `InvalidShapeError`. 14 | 2. The union of two rectangles is still a rectangle, which is the minimum covering rectangle of the two input rectangles. 15 | 3. For the outputs associated with `Quadrilateral` inputs, please see details in the [Problems related to the Quadrilateral Class](#problems-related-to-the-quadrilateral-class) section. 16 | 17 | ## The `intersect` Operation 18 | 19 | ![Illustration of Intersection Operations](intersection.png) 20 | ▲ The Illustration of Intersection Operations. Similar to the previous visualization, the resulting matrix is symmetric so only the lower triangular region is left empty. Each cell shows the visualization of the shape objects, their coordinates, and their object class.
For the output visualization, the gray and dashed line delineates the original obj1 and obj2, respectively, for reference. 21 | 22 | ## Problems related to the `Quadrilateral` Class 23 | 24 | It is possible to generate arbitrary shapes when performing shape operations on `Quadrilateral` objects. Currently Layout Parser does not provide the support for `Polygon` objects (but we plan to support that object in the near future), thus it becomes tricky to add support for these operations for `Quadrilateral`. The temporary solution is that: 25 | 1. When performing shape operations on `Quadrilateral` objects, Layout Parser will raise `NotSupportedShapeError`. 26 | 2. A workaround is to set `strict=False` in the input (i.e., `obj1.union(obj2, strict=False)`). In this case, any quadrilateral objects will be converted to `Rectangle`s first and the operation is executed. The results may not be *strictly* equivalent to those performed on the original objects. -------------------------------------------------------------------------------- /docs/notes/union.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Layout-Parser/layout-parser/04e28168d820eea3a1ff1e098078323e7b48648b/docs/notes/union.png -------------------------------------------------------------------------------- /examples/Customizing Layout Models with Label Studio Annotation/README.md: -------------------------------------------------------------------------------- 1 |
2 |

Customizing LayoutParser Models with Label Studio Annotation

3 | With Scientific Document Parsing as an example 4 | 5 | --- 6 | 7 | [Webinar Video](https://www.youtube.com/watch?v=puOKTFXRyr4) | [Slides](https://szj.io/assets/files/talks/2022-Feb-LayoutParser-and-Label-Studio-Webinar.pdf) | [Notebooks](Customizing%20Layout%20Models%20with%20Label%20Studio%20Annotation.ipynb) 8 |
import tempfile
import urllib.request
import zipfile
from pathlib import Path

import pandas as pd

# arXiv rejects requests carrying the default urllib User-Agent, so install a
# browser-like opener globally before any download happens.
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
urllib.request.install_opener(opener)


def download_auxiliary_paper_images(target_path: str = "downloaded-annotations"):
    """Download the source PDFs of the annotated papers from arXiv, render the
    annotated pages as 72-dpi images, and save them under
    ``<target_path>/images/``.

    Args:
        target_path (str): Root folder of the downloaded annotation data.
    """
    # Imported lazily: pdf2image is only required by this helper.
    import pdf2image

    data_to_download = pd.DataFrame(
        [
            ["1810.04805v2", 10, "1810.04805v2-10_ea8f.jpg"],
            ["1810.04805v2", 11, "1810.04805v2-11_213f.jpg"],
            ["1810.04805v2", 9, "1810.04805v2-9_dc05.jpg"],
            ["1908.03557v1", 10, "1908.03557v1-10_fa12.jpg"],
            ["1908.03557v1", 11, "1908.03557v1-11_a737.jpg"],
        ],
        columns=["arxiv_id", "page", "filename"],
    )

    # FIX: ensure the output folder exists before saving images into it.
    Path(target_path, "images").mkdir(parents=True, exist_ok=True)

    # Download each PDF only once, then render all of its requested pages.
    for arxiv_id, gp in data_to_download.groupby("arxiv_id"):
        with tempfile.TemporaryDirectory() as tempdir:
            arxiv_link = f"http://arxiv.org/pdf/{arxiv_id}.pdf"
            urllib.request.urlretrieve(arxiv_link, f"{tempdir}/{arxiv_id}.pdf")
            pdf_images = pdf2image.convert_from_path(
                f"{tempdir}/{arxiv_id}.pdf", dpi=72
            )
            for _, row in gp.iterrows():
                pdf_images[row["page"]].save(f"{target_path}/images/{row['filename']}")


ANNOTATION_FILE_PATH = "http://szj.io/assets/files/data/layoutparser-webinar-annotations-2022-Feb.zip"


def download_zipped_annotations():
    """Download the zipped annotation archive and extract it into the
    current working directory."""
    filehandle, _ = urllib.request.urlretrieve(ANNOTATION_FILE_PATH)
    # FIX: context manager guarantees the archive is closed even on error.
    with zipfile.ZipFile(filehandle, "r") as zip_ref:
        zip_ref.extractall("./")  # extract file to dir


if __name__ == "__main__":
    download_zipped_annotations()
    download_auxiliary_paper_images()
If you do not have Python installed on your computer, you might want to refer to [the official instructions](https://www.python.org/downloads/) to download and install the appropriate version of Python.
It will support all key functions in LayoutParser, including:
1. Layout Data Structure and operations
2. Layout Visualization
3. Load/export the layout data | 17 | | `pip install "layoutparser[effdet]"` | **Install LayoutParser with Layout Detection Model Support**
It will install the LayoutParser base library as well as
supporting dependencies for the ***EfficientDet***-based layout detection models. | 18 | | `pip install layoutparser torchvision && pip install "git+https://github.com/facebookresearch/detectron2.git@v0.5#egg=detectron2"` | **Install LayoutParser with Layout Detection Model Support**
It will install the LayoutParser base library as well as
supporting dependencies for the ***Detectron2***-based layout detection models. See details in [Additional Instruction: Install Detectron2 Layout Model Backend](#additional-instruction-install-detectron2-layout-model-backend). | 19 | | `pip install "layoutparser[paddledetection]"` | **Install LayoutParser with Layout Detection Model Support**
It will install the LayoutParser base library as well as
supporting dependencies for the ***PaddleDetection***-based layout detection models. | 20 | | `pip install "layoutparser[ocr]"` | **Install LayoutParser with OCR Support**
It will install the LayoutParser base library as well as
supporting dependencies for performing OCRs. See details in [Additional Instruction: Install OCR utils](#additional-instruction-install-ocr-utils). | 21 | 22 | ### Additional Instruction: Install Detectron2 Layout Model Backend 23 | 24 | #### For Mac OS and Linux Users 25 | 26 | If you would like to use the Detectron2 models for layout detection, you might need to run the following command: 27 | 28 | ```bash 29 | pip install layoutparser torchvision && pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.5#egg=detectron2" 30 | ``` 31 | 32 | This might take some time as the command will *compile* the library. If you also want to install a Detectron2 version 33 | with GPU support or encounter some issues during the installation process, please refer to the official Detectron2 34 | [installation instruction](https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md) for detailed 35 | information. 36 | 37 | #### For Windows users 38 | 39 | As reported by many users, the installation of Detectron2 can be rather tricky on Windows platforms. In our extensive tests, we find that it is nearly impossible to provide a one-line installation command for Windows users. As a workaround solution, for now we list the possible challenges for installing Detectron2 on Windows, and attach helpful resources for solving them. We are also investigating other possibilities to avoid installing Detectron2 to use pre-trained models. If you have any suggestions or ideas, please feel free to [submit an issue](https://github.com/Layout-Parser/layout-parser/issues) in our repo. 40 | 41 | 1. Challenges for installing `pycocotools` 42 | - You can find detailed instructions on [this post](https://changhsinlee.com/pycocotools/) from Chang Hsin Lee. 43 | - Another solution is try to install `pycocotools-windows`, see https://github.com/cocodataset/cocoapi/issues/415. 44 | 2. 
Challenges for installing `Detectron2` 45 | - [@ivanpp](https://github.com/ivanpp) curates a detailed description for installing `Detectron2` on Windows: [Detectron2 walkthrough (Windows)](https://ivanpp.cc/detectron2-walkthrough-windows/#step3installdetectron2) 46 | - `Detectron2` maintainers claim that they won't provide official support for Windows (see [1](https://github.com/facebookresearch/detectron2/issues/9#issuecomment-540974288) and [2](https://detectron2.readthedocs.io/en/latest/tutorials/install.html)), but Detectron2 is continuously built on windows with CircleCI (see [3](https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md#common-installation-issues)). Hopefully this situation will be improved in the future. 47 | 48 | 49 | ### Additional Instructions: Install OCR utils 50 | 51 | Layout Parser also comes with supports for OCR functions. In order to use them, you need to install the OCR utils via: 52 | 53 | ```bash 54 | pip install "layoutparser[ocr]" 55 | ``` 56 | 57 | Additionally, if you want to use the Tesseract-OCR engine, you also need to install it on your computer. Please check the 58 | [official documentation](https://tesseract-ocr.github.io/tessdoc/Installation.html) for detailed installation instructions. 59 | 60 | ## Known issues 61 | 62 |
Error: instantiating `lp.GCVAgent.with_credential` returns module 'google.cloud.vision' has no attribute 'types'. 63 |

64 | 65 | In this case, you have a newer version of the google-cloud-vision. Please consider downgrading the API using: 66 | ```bash 67 | pip install -U layoutparser[ocr] 68 | ``` 69 |

70 |
-------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = layoutparser 3 | description = Layout Parser is a deep learning assisted tool for Document Image Layout Analysis. 4 | keywords = layout analysis, deep learning 5 | license = Apache-2.0 6 | classifiers = 7 | Intended Audience :: Developers 8 | Intended Audience :: Education 9 | Intended Audience :: Science/Research 10 | License :: OSI Approved :: Apache Software License 11 | Programming Language :: Python :: 3 12 | Programming Language :: Python :: 3.6 13 | Programming Language :: Python :: 3.7 14 | Programming Language :: Python :: 3.8 15 | Programming Language :: Python :: 3.9 16 | Topic :: Scientific/Engineering :: Artificial Intelligence 17 | 18 | [options] 19 | zip_safe = False 20 | package_dir= 21 | =src 22 | packages=find: 23 | 24 | [options.packages.find] 25 | where=src -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Layout Parser team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
from setuptools import setup, find_packages
import os

# A trick from https://github.com/jina-ai/jina/blob/79b302c93b01689e82cf4b52f46522eb7497c404/setup.py#L20:
# read __version__ straight out of the package source so that setup.py does
# not need the runtime dependencies installed just to import the package.
pkg_name = 'layoutparser'
libinfo_py = os.path.join('src', pkg_name, '__init__.py')
# FIX: use context managers so both file handles are closed deterministically
# (they were previously opened inline and never closed).
with open(libinfo_py, 'r', encoding='utf8') as fp:
    libinfo_content = fp.readlines()
version_line = [l.strip() for l in libinfo_content if l.startswith('__version__')][0]
exec(version_line)  # gives __version__

with open("README.md", "r", encoding="utf-8") as fp:
    long_description = fp.read()

setup(
    name="layoutparser",
    version=__version__,
    author="Zejiang Shen, Ruochen Zhang, and Layout Parser Model Contributors",
    author_email="layoutparser@gmail.com",
    license="Apache-2.0",
    url="https://github.com/Layout-Parser/layout-parser",
    package_dir={"": "src"},
    packages=find_packages("src"),
    description="A unified toolkit for Deep Learning Based Document Image Analysis",
    long_description=long_description,
    long_description_content_type="text/markdown",
    python_requires=">=3.6",
    install_requires=[
        "numpy",
        "opencv-python",
        "scipy",
        "pandas",
        "pillow",
        "pyyaml>=5.1",
        "iopath",
        "pdfplumber",
        "pdf2image",
    ],
    extras_require={
        # OCR backends
        "ocr": ["google-cloud-vision==1", "pytesseract"],
        "gcv": ["google-cloud-vision==1"],
        "tesseract": ["pytesseract"],
        # Layout detection model backends
        "layoutmodels": ["torch", "torchvision", "effdet"],
        "effdet": ["torch", "torchvision", "effdet"],
        "paddledetection": ["paddlepaddle==2.1.0"],
    },
    include_package_data=True,
)
# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.3.4"

import sys

from .file_utils import (
    _LazyModule,
    is_detectron2_available,
    is_paddle_available,
    is_effdet_available,
    is_pytesseract_available,
    is_gcv_available,
)

# Map of submodule name -> public names it exports. Actual imports are
# deferred through _LazyModule so heavy optional dependencies are only loaded
# the first time one of their objects is accessed.
_import_structure = {
    "elements": ["Interval", "Rectangle", "Quadrilateral", "TextBlock", "Layout"],
    "visualization": ["draw_box", "draw_text"],
    "io": ["load_json", "load_dict", "load_csv", "load_dataframe", "load_pdf"],
    "file_utils": [
        "is_torch_available",
        "is_torch_cuda_available",
        "is_detectron2_available",
        "is_paddle_available",
        "is_pytesseract_available",
        "is_gcv_available",
        "requires_backends",
    ],
    "tools": [
        "generalized_connected_component_analysis_1d",
        "simple_line_detection",
        "group_textblocks_based_on_category",
    ],
    "models": ["AutoLayoutModel"],
}

# Backend-specific models and OCR agents are only exported when their
# dependencies are importable in the current environment.
if is_detectron2_available():
    _import_structure["models.detectron2"] = ["Detectron2LayoutModel"]

if is_paddle_available():
    _import_structure["models.paddledetection"] = ["PaddleDetectionLayoutModel"]

if is_effdet_available():
    _import_structure["models.effdet"] = ["EfficientDetLayoutModel"]

if is_pytesseract_available():
    _import_structure["ocr.tesseract_agent"] = [
        "TesseractAgent",
        "TesseractFeatureType",
    ]

if is_gcv_available():
    _import_structure["ocr.gcv_agent"] = ["GCVAgent", "GCVFeatureType"]

# Swap this module out for a lazy proxy that resolves attributes on demand.
sys.modules[__name__] = _LazyModule(
    __name__,
    globals()["__file__"],
    _import_structure,
    module_spec=__spec__,
    extra_objects={"__version__": __version__},
)
# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

class NotSupportedShapeError(Exception):
    """Raised when a shape operation would produce a shape that layoutparser
    cannot represent yet.

    For now (v0.2), if the created shape might be a polygon (a shape with more
    than 4 vertices), layoutparser will raise ``NotSupportedShapeError``. It is
    expected to be fixed in future versions. See
    :ref:`shape_operations:problems-related-to-the-quadrilateral-class`.
    """


class InvalidShapeError(Exception):
    """Raised when a shape operation such as intersection or union would
    create an invalid shape (e.g., intersecting a rectangle and an interval
    from an incompatible axis).
    """
# FIX: removed the duplicated `Dict` from the typing import.
from typing import List, Union, Dict, Any, Optional, Tuple

import numpy as np

# NOTE(review): `Image` is not used anywhere in this module; the import is
# kept for backward compatibility but guarded so the geometry helpers remain
# importable even when Pillow is unavailable.
try:
    from PIL import Image
except ImportError:  # pragma: no cover
    Image = None


def cvt_coordinates_to_points(coords: Tuple[float, float, float, float]) -> np.ndarray:
    """Convert ``(x_1, y_1, x_2, y_2)`` rectangle coordinates into a 4x2
    array of corner points, ordered top-left, top-right, bottom-right,
    bottom-left."""
    x_1, y_1, x_2, y_2 = coords
    return np.array(
        [
            [x_1, y_1],  # Top Left
            [x_2, y_1],  # Top Right
            [x_2, y_2],  # Bottom Right
            [x_1, y_2],  # Bottom Left
        ]
    )


def cvt_points_to_coordinates(points: np.ndarray) -> Tuple[float, float, float, float]:
    """Return the ``(x_1, y_1, x_2, y_2)`` axis-aligned bounding box of an
    (N, 2) array of points."""
    x_1 = points[:, 0].min()
    y_1 = points[:, 1].min()
    x_2 = points[:, 0].max()
    y_2 = points[:, 1].max()
    return (x_1, y_1, x_2, y_2)


def perspective_transformation(
    M: np.ndarray, points: np.ndarray, is_inv: bool = False
) -> np.ndarray:
    """Apply the 3x3 perspective transform ``M`` (or its inverse when
    ``is_inv`` is True) to an (N, 2) array of points.

    Args:
        M (`np.ndarray`): A 3x3 perspective transformation matrix.
        points (`np.ndarray`): The (N, 2) array of points to transform.
        is_inv (bool): If True, apply the inverse of ``M`` instead.

    Returns:
        `np.ndarray`: The transformed (N, 2) points.
    """
    if is_inv:
        M = np.linalg.inv(M)

    # Lift to homogeneous coordinates: (N, 2) -> (3, N).
    src_mid = np.hstack([points, np.ones((points.shape[0], 1))]).T
    dst_mid = np.matmul(M, src_mid)

    # Normalize by the homogeneous coordinate and drop it: back to (N, 2).
    dst = (dst_mid / dst_mid[-1]).T[:, :2]

    return dst


def vertice_in_polygon(vertice: np.ndarray, polygon_points: np.ndarray) -> bool:
    """Check whether ``vertice`` lies inside the convex polygon defined by
    ``polygon_points``.

    The implementation is based on the algorithm from
    https://demonstrations.wolfram.com/AnEfficientTestForAPointToBeInAConvexPolygon/:
    after shifting the origin to the query point, the point is inside iff all
    consecutive-edge cross products share the same sign.

    NOTE(review): the original comments disagreed on the sign convention
    ("ordered clockwise" vs "det should <= 0"). The code requires det >= 0,
    which matches clockwise ordering in image coordinates (y-axis pointing
    down) — confirm against callers before changing.
    """
    points = polygon_points - vertice  # shift the coordinates origin to the vertice
    edges = np.append(points, points[0:1, :], axis=0)
    return all(np.linalg.det([e1, e2]) >= 0 for e1, e2 in zip(edges, edges[1:]))


def polygon_area(xs: np.ndarray, ys: np.ndarray) -> float:
    """Calculate the area of polygons using the
    `Shoelace Formula <https://en.wikipedia.org/wiki/Shoelace_formula>`_.

    Args:
        xs (`np.ndarray`): The x coordinates of the points
        ys (`np.ndarray`): The y coordinates of the points
    """
    # Refer to: https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates
    # The formula is equivalent to the original one indicated in the wikipedia
    # page.
    return 0.5 * np.abs(np.dot(xs, np.roll(ys, 1)) - np.dot(ys, np.roll(xs, 1)))


# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Some code are adapted from
# https://github.com/huggingface/transformers/blob/master/src/transformers/file_utils.py

from typing import Any, BinaryIO, Dict, List, Optional, Tuple, Union
import sys
import os
import logging
import importlib.util
from types import ModuleType

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

# The package importlib_metadata is in a different place, depending on the python version.
# The importlib_metadata backport lives in a different location depending on
# the Python version; normalize it to one name.
if sys.version_info < (3, 8):
    import importlib_metadata
else:
    import importlib.metadata as importlib_metadata

# Idempotent re-definition of the module logger so this section stands alone.
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

###########################################
############ Layout Model Deps ############
###########################################

# For every optional backend: `find_spec` checks the module is importable, and
# the metadata lookup confirms an installed distribution (and logs its
# version). If the distribution metadata is missing, the backend is marked
# unavailable.

_torch_available = importlib.util.find_spec("torch") is not None
try:
    _torch_version = importlib_metadata.version("torch")
    logger.debug(f"PyTorch version {_torch_version} available.")
except importlib_metadata.PackageNotFoundError:
    _torch_available = False

_detectron2_available = importlib.util.find_spec("detectron2") is not None
try:
    _detectron2_version = importlib_metadata.version("detectron2")
    logger.debug(f"Detectron2 version {_detectron2_version} available")
except importlib_metadata.PackageNotFoundError:
    _detectron2_available = False

_paddle_available = importlib.util.find_spec("paddle") is not None
try:
    # The name of the paddlepaddle library:
    # Install name: pip install paddlepaddle
    # Import name: import paddle
    _paddle_version = importlib_metadata.version("paddlepaddle")
    logger.debug(f"Paddle version {_paddle_version} available.")
except importlib_metadata.PackageNotFoundError:
    _paddle_available = False

_effdet_available = importlib.util.find_spec("effdet") is not None
try:
    _effdet_version = importlib_metadata.version("effdet")
    logger.debug(f"Effdet version {_effdet_version} available.")
except importlib_metadata.PackageNotFoundError:
    # BUG FIX: this previously assigned `_effdet_version = False`, leaving
    # `_effdet_available` True for a module whose distribution metadata is
    # missing — inconsistent with every other backend check above.
    _effdet_available = False

###########################################
############## OCR Tool Deps ##############
###########################################

_pytesseract_available = importlib.util.find_spec("pytesseract") is not None
try:
    _pytesseract_version = importlib_metadata.version("pytesseract")
    logger.debug(f"Pytesseract version {_pytesseract_version} available.")
except importlib_metadata.PackageNotFoundError:
    _pytesseract_available = False

try:
    _gcv_available = importlib.util.find_spec("google.cloud.vision") is not None
    try:
        _gcv_version = importlib_metadata.version(
            "google-cloud-vision"
        )  # This is slightly different
        logger.debug(f"Google Cloud Vision Utils version {_gcv_version} available.")
    except importlib_metadata.PackageNotFoundError:
        _gcv_available = False
except ModuleNotFoundError:
    # find_spec("google.cloud.vision") itself raises when the parent `google`
    # namespace package is absent.
    _gcv_available = False


def is_torch_available():
    """Whether PyTorch is importable and installed."""
    return _torch_available


def is_torch_cuda_available():
    """Whether PyTorch is installed *and* a CUDA device is usable."""
    if is_torch_available():
        import torch

        return torch.cuda.is_available()
    else:
        return False


def is_detectron2_available():
    """Whether the Detectron2 layout-model backend is installed."""
    return _detectron2_available


def is_paddle_available():
    """Whether the PaddlePaddle layout-model backend is installed."""
    return _paddle_available


def is_effdet_available():
    """Whether the EfficientDet layout-model backend is installed."""
    return _effdet_available


def is_pytesseract_available():
    """Whether the pytesseract OCR backend is installed."""
    return _pytesseract_available


def is_gcv_available():
    """Whether the Google Cloud Vision OCR backend is installed."""
    return _gcv_available


PYTORCH_IMPORT_ERROR = """
{0} requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
"""

DETECTRON2_IMPORT_ERROR = """
{0} requires the detectron2 library but it was not found in your environment. Checkout the instructions on the
installation page: https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md and follow the ones
that match your environment. Typically the following would work for MacOS or Linux CPU machines:
    pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2'
"""
PADDLE_IMPORT_ERROR = """
{0} requires the PaddlePaddle library but it was not found in your environment. Checkout the instructions on the
installation page: https://github.com/PaddlePaddle/Paddle and follow the ones that match your environment.
"""

EFFDET_IMPORT_ERROR = """
{0} requires the effdet library but it was not found in your environment. You can install it with pip:
`pip install effdet`
"""

PYTESSERACT_IMPORT_ERROR = """
{0} requires the PyTesseract library but it was not found in your environment. You can install it with pip:
`pip install pytesseract`
"""

GCV_IMPORT_ERROR = """
{0} requires the Google Cloud Vision Python utils but it was not found in your environment. You can install it with pip:
`pip install google-cloud-vision==1`
"""

# Backend key -> (availability predicate, import-error message template).
BACKENDS_MAPPING = {
    "torch": (is_torch_available, PYTORCH_IMPORT_ERROR),
    "detectron2": (is_detectron2_available, DETECTRON2_IMPORT_ERROR),
    "paddle": (is_paddle_available, PADDLE_IMPORT_ERROR),
    "effdet": (is_effdet_available, EFFDET_IMPORT_ERROR),
    "pytesseract": (is_pytesseract_available, PYTESSERACT_IMPORT_ERROR),
    "google-cloud-vision": (is_gcv_available, GCV_IMPORT_ERROR),
}


def requires_backends(obj, backends):
    """Raise an ``ImportError`` with installation instructions when any of the
    named ``backends`` required by ``obj`` is unavailable.

    Args:
        obj: The function or object (used only for its name in the message).
        backends: A backend key or list/tuple of keys from BACKENDS_MAPPING.
    """
    if not isinstance(backends, (list, tuple)):
        backends = [backends]

    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
    checks = [BACKENDS_MAPPING[backend] for backend in backends]
    # If any requested backend is missing, the error message concatenates the
    # instructions for every requested backend (matching original behavior).
    if not all(is_available() for is_available, _ in checks):
        raise ImportError(
            "".join(template.format(name) for _, template in checks)
        )
class _LazyModule(ModuleType):
    """
    Module class that surfaces all objects but only performs associated imports when the objects are requested.
    """

    # Adapted from HuggingFace
    # https://github.com/huggingface/transformers/blob/c37573806ab3526dd805c49cbe2489ad4d68a9d7/src/transformers/file_utils.py#L1990

    def __init__(
        self, name, module_file, import_structure, module_spec=None, extra_objects=None
    ):
        super().__init__(name)
        # Submodule names that may be lazily imported.
        self._modules = set(import_structure.keys())
        # Reverse map: exported object name -> submodule that defines it.
        self._class_to_module = {}
        for key, values in import_structure.items():
            for value in values:
                self._class_to_module[value] = key
        # Needed for autocompletion in an IDE
        self.__all__ = list(import_structure.keys()) + sum(
            import_structure.values(), []
        )
        self.__file__ = module_file
        self.__spec__ = module_spec
        self.__path__ = [os.path.dirname(module_file)]
        # Objects (e.g. __version__) served directly without triggering an import.
        self._objects = {} if extra_objects is None else extra_objects
        self._name = name
        self._import_structure = import_structure

        # Following [PEP 366](https://www.python.org/dev/peps/pep-0366/)
        # The __package__ variable should be set
        # https://docs.python.org/3/reference/import.html#__package__
        self.__package__ = self.__name__

    # Needed for autocompletion in an IDE
    def __dir__(self):
        return super().__dir__() + self.__all__

    def __getattr__(self, name: str) -> Any:
        # Resolution order: extra objects first, then whole submodules, then
        # exported names (which trigger the owning submodule's import).
        if name in self._objects:
            return self._objects[name]
        if name in self._modules:
            value = self._get_module(name)
        elif name in self._class_to_module.keys():
            module = self._get_module(self._class_to_module[name])
            value = getattr(module, name)
        else:
            raise AttributeError(f"module {self.__name__} has no attribute {name}")

        # Cache the resolved attribute so __getattr__ is only hit once per name.
        setattr(self, name, value)
        return value

    def _get_module(self, module_name: str):
        # Import the submodule relative to this (package-like) module.
        return importlib.import_module("." + module_name, self.__name__)

    def __reduce__(self):
        # Support pickling by reconstructing from the import structure.
        return (self.__class__, (self._name, self.__file__, self._import_structure))


from .basic import load_json, load_dict, load_csv, load_dataframe
from .pdf import load_pdf


# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ast
import json
from typing import List, Union, Dict, Dict, Any

import pandas as pd

from ..elements import (
    BaseLayoutElement,
    TextBlock,
    Layout,
    BASECOORD_ELEMENT_NAMEMAP,
)


def load_json(filename: str) -> Union[BaseLayoutElement, Layout]:
    """Load a JSON file and save it as a layout object with appropriate data types.

    Args:
        filename (str):
            The name of the JSON file.

    Returns:
        Union[BaseLayoutElement, Layout]:
            Based on the JSON file format, it will automatically parse
            the type of the data and load it accordingly.
40 | """ 41 | with open(filename, "r") as fp: 42 | res = json.load(fp) 43 | 44 | return load_dict(res) 45 | 46 | 47 | def load_dict(data: Union[Dict, List[Dict]]) -> Union[BaseLayoutElement, Layout]: 48 | """Load a dict of list of dict representations of some layout data, 49 | automatically parse its type, and save it as any of BaseLayoutElement 50 | or Layout datatype. 51 | 52 | Args: 53 | data (Union[Dict, List]): 54 | A dict of list of dict representations of the layout data 55 | 56 | Raises: 57 | ValueError: 58 | If the data format is incompatible with the layout-data-JSON format, 59 | raise a `ValueError`. 60 | ValueError: 61 | If any `block_type` name is not in the available list of layout element 62 | names defined in `BASECOORD_ELEMENT_NAMEMAP`, raise a `ValueError`. 63 | 64 | Returns: 65 | Union[BaseLayoutElement, Layout]: 66 | Based on the dict format, it will automatically parse the type of 67 | the data and load it accordingly. 68 | """ 69 | if isinstance(data, dict): 70 | if "page_data" in data: 71 | # It is a layout instance 72 | return Layout(load_dict(data["blocks"])._blocks, page_data=data["page_data"]) 73 | else: 74 | 75 | if data["block_type"] not in BASECOORD_ELEMENT_NAMEMAP: 76 | raise ValueError(f"Invalid block_type {data['block_type']}") 77 | 78 | # Check if it is a textblock 79 | is_textblock = any(ele in data for ele in TextBlock._features) 80 | if is_textblock: 81 | return TextBlock.from_dict(data) 82 | else: 83 | return BASECOORD_ELEMENT_NAMEMAP[data["block_type"]].from_dict(data) 84 | 85 | elif isinstance(data, list): 86 | return Layout([load_dict(ele) for ele in data]) 87 | 88 | else: 89 | raise ValueError(f"Invalid input JSON structure.") 90 | 91 | 92 | def load_csv(filename: str, block_type: str = None) -> Layout: 93 | """Load the Layout object from the given CSV file. 94 | 95 | Args: 96 | filename (str): 97 | The name of the CSV file. A row of the table represents 98 | an individual layout element. 
99 | 100 | block_type (str): 101 | If there's no block_type column in the CSV file, 102 | you must pass in a block_type variable such that layout parser 103 | can appropriately detect the type of the layout elements. 104 | 105 | Returns: 106 | Layout: 107 | The parsed Layout object from the CSV file. 108 | """ 109 | 110 | return load_dataframe(pd.read_csv(filename), block_type=block_type) 111 | 112 | 113 | def load_dataframe(df: pd.DataFrame, block_type: str = None) -> Layout: 114 | """Load the Layout object from the given dataframe. 115 | 116 | Args: 117 | df (pd.DataFrame): 118 | 119 | block_type (str): 120 | If there's no block_type column in the CSV file, 121 | you must pass in a block_type variable such that layout parser 122 | can appropriately detect the type of the layout elements. 123 | 124 | Returns: 125 | Layout: 126 | The parsed Layout object from the CSV file. 127 | """ 128 | df = df.copy() 129 | if "points" in df.columns: 130 | if df["points"].dtype == object: 131 | df["points"] = df["points"].map( 132 | lambda x: ast.literal_eval(x) if not pd.isna(x) else x 133 | ) 134 | 135 | if block_type is None: 136 | if "block_type" not in df.columns: 137 | raise ValueError( 138 | "`block_type` not specified both in dataframe and arguments" 139 | ) 140 | else: 141 | df["block_type"] = block_type 142 | 143 | if any(col in TextBlock._features for col in df.columns): 144 | # Automatically setting index for textblock 145 | if "id" not in df.columns: 146 | df["id"] = df.index 147 | 148 | return load_dict(df.apply(lambda x: x.dropna().to_dict(), axis=1).to_list()) 149 | -------------------------------------------------------------------------------- /src/layoutparser/io/pdf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Layout Parser team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Union, Optional, Dict, Tuple

import pdfplumber
import pandas as pd

from ..elements import Layout
from .basic import load_dataframe

# Native rendering resolution of the pdfplumber parser (PDF "points" per inch).
DEFAULT_PDF_DPI = 72


def extract_words_for_page(
    page: pdfplumber.page.Page,
    x_tolerance=1.5,
    y_tolerance=2,
    keep_blank_chars=False,
    use_text_flow=True,
    horizontal_ltr=True,
    vertical_ttb=True,
    extra_attrs=None,
) -> Layout:
    """The helper function used for extracting words from a pdfplumber page
    object. The keyword arguments are forwarded to
    `pdfplumber.Page.extract_words`; see `load_pdf` for their meaning.

    Returns:
        Layout: a layout object representing all extracted pdf tokens on this page.
    """
    if extra_attrs is None:
        extra_attrs = ["fontname", "size"]

    tokens = page.extract_words(
        x_tolerance=x_tolerance,
        y_tolerance=y_tolerance,
        keep_blank_chars=keep_blank_chars,
        use_text_flow=use_text_flow,
        horizontal_ltr=horizontal_ltr,
        vertical_ttb=vertical_ttb,
        extra_attrs=extra_attrs,
    )

    df = pd.DataFrame(tokens)

    # A page can legitimately contain no extractable words (e.g. scans).
    if len(df) == 0:
        return Layout()

    # Clamp coordinates to the page bounds; pdfplumber can report tokens
    # slightly outside the page box.
    df[["x0", "x1"]] = (
        df[["x0", "x1"]].clip(lower=0, upper=int(page.width)).astype("float")
    )
    df[["top", "bottom"]] = (
        df[["top", "bottom"]].clip(lower=0, upper=int(page.height)).astype("float")
    )

    page_tokens = load_dataframe(
        df.reset_index().rename(
            columns={
                "x0": "x_1",
                "x1": "x_2",
                "top": "y_1",
                "bottom": "y_2",
                "index": "id",
                "fontname": "type",  # also loading fontname as "type"
            }
        ),
        block_type="rectangle",
    )

    return page_tokens


def load_pdf(
    filename: str,
    load_images: bool = False,
    x_tolerance: float = 1.5,
    y_tolerance: float = 2,
    keep_blank_chars: bool = False,
    use_text_flow: bool = True,
    horizontal_ltr: bool = True,
    vertical_ttb: bool = True,
    extra_attrs: Optional[List[str]] = None,
    dpi: int = DEFAULT_PDF_DPI,
) -> Union[List[Layout], Tuple[List[Layout], List["Image.Image"]]]:
    """Load all tokens for each page from a PDF file, and save them
    in a list of Layout objects with the original page order.

    Args:
        filename (str): The path to the PDF file.
        load_images (bool, optional):
            Whether to load a screenshot for each page of the PDF file.
            When set to True, the function will return both the layout and
            the screenshot image for each page.
            Defaults to False.
        x_tolerance (float, optional):
            The threshold used for extracting "word tokens" from the pdf file.
            It will merge pdf characters into a word token if the difference
            between the x_2 of one character and the x_1 of the next is less
            than or equal to x_tolerance. See details in `pdfplumber's
            documentation`_.
            Defaults to 1.5.
        y_tolerance (float, optional):
            The threshold used for extracting "word tokens" from the pdf file.
            It will merge pdf characters into a word token if the difference
            between the y_2 of one character and the y_1 of the next is less
            than or equal to y_tolerance. See details in `pdfplumber's
            documentation`_.
            Defaults to 2.
        keep_blank_chars (bool, optional):
            When keep_blank_chars is set to True, blank characters are treated
            as part of a word, not as a space between words. See details in
            `pdfplumber's documentation`_.
            Defaults to False.
        use_text_flow (bool, optional):
            When use_text_flow is set to True, it will use the PDF's underlying
            flow of characters as a guide for ordering and segmenting the
            words, rather than presorting the characters by x/y position.
            (This mimics how dragging a cursor highlights text in a PDF; as
            with that, the order does not always appear to be logical.) See
            details in `pdfplumber's documentation`_.
            Defaults to True.
        horizontal_ltr (bool, optional):
            When horizontal_ltr is set to True, it means the doc should read
            text from left to right, vice versa.
            Defaults to True.
        vertical_ttb (bool, optional):
            When vertical_ttb is set to True, it means the doc should read
            text from top to bottom, vice versa.
            Defaults to True.
        extra_attrs (Optional[List[str]], optional):
            Passing a list of extra_attrs (e.g., ["fontname", "size"]) will
            restrict each word to characters that share exactly the same
            value for each of those attributes extracted by pdfplumber, and
            the resulting word dicts will indicate those attributes. See
            details in `pdfplumber's documentation`_.
            Defaults to `["fontname", "size"]`.
        dpi (int, optional):
            When loading images of the pdf, you can also specify the
            resolution (DPI, dots per inch) for rendering the images. Higher
            DPI values mean clearer images (also larger file sizes).
            Setting dpi will also automatically resize the extracted
            pdf_layout to match the sizes of the images, so the pdf_layouts
            can be rendered appropriately when visualized.
            Defaults to `DEFAULT_PDF_DPI=72`, which is also the default
            rendering dpi from the pdfplumber PDF parser.

    Returns:
        List[Layout]:
            When `load_images=False`, it will only load the pdf_tokens from
            the PDF file. Each element of the list denotes all the tokens
            appearing on a single page, and the list is ordered the same as
            the original PDF page order.
        Tuple[List[Layout], List["Image.Image"]]:
            When `load_images=True`, besides the `all_page_layout`, it will
            also return a list of page images.

    Examples::
        >>> import layoutparser as lp
        >>> pdf_layout = lp.load_pdf("path/to/pdf")
        >>> pdf_layout[0] # the layout for page 0
        >>> pdf_layout, pdf_images = lp.load_pdf("path/to/pdf", load_images=True)
        >>> lp.draw_box(pdf_images[0], pdf_layout[0])
    """
    all_page_layout = []

    # Use a context manager so the underlying file handle is always released,
    # even if token extraction raises midway through the document.
    with pdfplumber.open(filename) as plumber_pdf_object:
        for page_id, cur_page in enumerate(plumber_pdf_object.pages):
            page_tokens = extract_words_for_page(
                cur_page,
                x_tolerance=x_tolerance,
                y_tolerance=y_tolerance,
                keep_blank_chars=keep_blank_chars,
                use_text_flow=use_text_flow,
                horizontal_ltr=horizontal_ltr,
                vertical_ttb=vertical_ttb,
                extra_attrs=extra_attrs,
            )

            # Adding metadata for the current page
            page_tokens.page_data["width"] = float(cur_page.width)
            page_tokens.page_data["height"] = float(cur_page.height)
            page_tokens.page_data["index"] = page_id

            all_page_layout.append(page_tokens)

    if not load_images:
        return all_page_layout

    # Deferred import: pdf2image is an optional dependency only needed
    # when page screenshots are requested.
    import pdf2image

    pdf_images = pdf2image.convert_from_path(filename, dpi=dpi)

    for page_id, page_image in enumerate(pdf_images):
        image_width, image_height = page_image.size
        page_layout = all_page_layout[page_id]
        layout_width = page_layout.page_data["width"]
        layout_height = page_layout.page_data["height"]
        if image_width != layout_width or image_height != layout_height:
            # Rendering dpi may differ from the PDF's native 72 dpi
            # coordinate space; rescale the layout so it lines up with
            # the screenshot.
            scale_x = image_width / layout_width
            scale_y = image_height / layout_height
            page_layout = page_layout.scale((scale_x, scale_y))
            page_layout.page_data["width"] = image_width
            page_layout.page_data["height"] = image_height
            all_page_layout[page_id] = page_layout

    return all_page_layout, pdf_images
-------------------------------------------------------------------------------- /src/layoutparser/misc/NotoSerifCJKjp-Regular.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Layout-Parser/layout-parser/04e28168d820eea3a1ff1e098078323e7b48648b/src/layoutparser/misc/NotoSerifCJKjp-Regular.otf -------------------------------------------------------------------------------- /src/layoutparser/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Layout Parser team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .detectron2.layoutmodel import Detectron2LayoutModel 16 | from .paddledetection.layoutmodel import PaddleDetectionLayoutModel 17 | from .effdet.layoutmodel import EfficientDetLayoutModel 18 | from .auto_layoutmodel import AutoLayoutModel -------------------------------------------------------------------------------- /src/layoutparser/models/auto_layoutmodel.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Layout Parser team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Optional, Dict, Union, List 16 | from collections import defaultdict 17 | 18 | from .model_config import ( 19 | is_lp_layout_model_config_any_format, 20 | ) 21 | from ..file_utils import ( 22 | is_effdet_available, 23 | is_detectron2_available, 24 | is_paddle_available, 25 | ) 26 | 27 | ALL_AVAILABLE_BACKENDS = dict() 28 | ALL_AVAILABLE_DATASETS = defaultdict(list) 29 | 30 | if is_effdet_available(): 31 | from .effdet.layoutmodel import EfficientDetLayoutModel 32 | from .effdet.catalog import MODEL_CATALOG as _effdet_model_catalog 33 | 34 | # fmt: off 35 | ALL_AVAILABLE_BACKENDS[EfficientDetLayoutModel.DETECTOR_NAME] = EfficientDetLayoutModel 36 | for dataset_name in _effdet_model_catalog: 37 | ALL_AVAILABLE_DATASETS[dataset_name].append(EfficientDetLayoutModel.DETECTOR_NAME) 38 | # fmt: on 39 | 40 | if is_detectron2_available(): 41 | from .detectron2.layoutmodel import Detectron2LayoutModel 42 | from .detectron2.catalog import MODEL_CATALOG as _detectron2_model_catalog 43 | 44 | # fmt: off 45 | ALL_AVAILABLE_BACKENDS[Detectron2LayoutModel.DETECTOR_NAME] = Detectron2LayoutModel 46 | for dataset_name in _detectron2_model_catalog: 47 | ALL_AVAILABLE_DATASETS[dataset_name].append(Detectron2LayoutModel.DETECTOR_NAME) 48 | # fmt: on 49 | 50 | if is_paddle_available(): 51 | from .paddledetection.layoutmodel import PaddleDetectionLayoutModel 52 | from .paddledetection.catalog import MODEL_CATALOG as _paddle_model_catalog 53 | 54 | # fmt: off 55 | 
ALL_AVAILABLE_BACKENDS[PaddleDetectionLayoutModel.DETECTOR_NAME] = PaddleDetectionLayoutModel 56 | for dataset_name in _paddle_model_catalog: 57 | ALL_AVAILABLE_DATASETS[dataset_name].append(PaddleDetectionLayoutModel.DETECTOR_NAME) 58 | # fmt: on 59 | 60 | 61 | def AutoLayoutModel( 62 | config_path: str, 63 | model_path: Optional[str] = None, 64 | label_map: Optional[Dict] = None, 65 | device: Optional[str] = None, 66 | extra_config: Optional[Union[Dict, List]] = None, 67 | ) -> "BaseLayoutModel": 68 | """[summary] 69 | 70 | Args: 71 | config_path (:obj:`str`): 72 | The path to the configuration file. 73 | model_path (:obj:`str`, None): 74 | The path to the saved weights of the model. 75 | If set, overwrite the weights in the configuration file. 76 | Defaults to `None`. 77 | label_map (:obj:`dict`, optional): 78 | The map from the model prediction (ids) to real 79 | word labels (strings). If the config is from one of the supported 80 | datasets, Layout Parser will automatically initialize the label_map. 81 | Defaults to `None`. 82 | device(:obj:`str`, optional): 83 | Whether to use cuda or cpu devices. If not set, LayoutParser will 84 | automatically determine the device to initialize the models on. 85 | extra_config (:obj:`dict`, optional): 86 | Extra configuration passed used for initializing the layout model. 
87 | 88 | Returns: 89 | # BaseLayoutModel: the create LayoutModel instance 90 | """ 91 | if not is_lp_layout_model_config_any_format(config_path): 92 | raise ValueError(f"Invalid model config_path {config_path}") 93 | 94 | # Try to search for the model keywords 95 | for backend_name in ALL_AVAILABLE_BACKENDS: 96 | if backend_name in config_path: 97 | return ALL_AVAILABLE_BACKENDS[backend_name]( 98 | config_path, 99 | model_path=model_path, 100 | label_map=label_map, 101 | extra_config=extra_config, 102 | device=device, 103 | ) 104 | 105 | # Try to search for the dataset keywords 106 | for dataset_name in ALL_AVAILABLE_DATASETS: 107 | if dataset_name in config_path: 108 | return ALL_AVAILABLE_BACKENDS[ALL_AVAILABLE_DATASETS[dataset_name][0]]( 109 | config_path, 110 | model_path=model_path, 111 | label_map=label_map, 112 | extra_config=extra_config, 113 | device=device, 114 | ) 115 | 116 | raise ValueError(f"No available model found for {config_path}") -------------------------------------------------------------------------------- /src/layoutparser/models/base_catalog.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Layout Parser team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from iopath.common.file_io import HTTPURLHandler 16 | from iopath.common.file_io import PathManager as PathManagerBase 17 | 18 | # A trick learned from https://github.com/facebookresearch/detectron2/blob/65faeb4779e4c142484deeece18dc958c5c9ad18/detectron2/utils/file_io.py#L3 19 | 20 | 21 | class DropboxHandler(HTTPURLHandler): 22 | """ 23 | Supports download and file check for dropbox links 24 | """ 25 | 26 | def _get_supported_prefixes(self): 27 | return ["https://www.dropbox.com"] 28 | 29 | def _isfile(self, path): 30 | return path in self.cache_map 31 | 32 | 33 | PathManager = PathManagerBase() 34 | PathManager.register_handler(DropboxHandler()) -------------------------------------------------------------------------------- /src/layoutparser/models/base_layoutmodel.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Layout Parser team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from typing import Optional, Tuple, Union, Dict 16 | from abc import ABC, abstractmethod 17 | 18 | from .model_config import LayoutModelConfig, add_identifier_for_config, layout_model_config_parser, is_lp_layout_model_config_any_format 19 | from ..file_utils import requires_backends 20 | 21 | class BaseLayoutModel(ABC): 22 | 23 | # TODO: Build a metaclass for lazy module loader 24 | @property 25 | @abstractmethod 26 | def DEPENDENCIES(self): 27 | """DEPENDENCIES lists all necessary dependencies for the class.""" 28 | pass 29 | 30 | @property 31 | @abstractmethod 32 | def DETECTOR_NAME(self): 33 | pass 34 | 35 | @property 36 | @abstractmethod 37 | def MODEL_CATALOG(self) -> Dict[str, Dict[str, str]]: 38 | pass 39 | 40 | @abstractmethod 41 | def detect(self, image: Union["np.ndarray", "Image.Image"]): 42 | pass 43 | 44 | 45 | @abstractmethod 46 | def image_loader(self, image: Union["np.ndarray", "Image.Image"]): 47 | """It will process the input images appropriately to the target format.""" 48 | pass 49 | 50 | def _parse_config(self, config_path:str, identifier:str) -> Union[LayoutModelConfig, str]: 51 | 52 | if is_lp_layout_model_config_any_format(config_path): 53 | config_path = add_identifier_for_config(config_path, identifier) 54 | for dataset_name in self.MODEL_CATALOG: 55 | if dataset_name in config_path: 56 | default_model_arch = list(self.MODEL_CATALOG[dataset_name].keys())[0] 57 | # Use the first model_name for the dataset as the default_model_arch 58 | return layout_model_config_parser(config_path, self.DETECTOR_NAME, default_model_arch) 59 | raise ValueError(f"The config {config_path} is not a valid config for {self.__class__}, " 60 | f"possibly because there aren't models trained for the specified dataset.") 61 | else: 62 | return config_path 63 | 64 | def config_parser(self, config_path:str, model_path: Optional[str], allow_empty_path=False) -> Tuple[str, str]: 65 | 66 | config_path = self._parse_config(config_path, "config") 67 | 68 | if 
isinstance(config_path, str) and model_path is None: 69 | if not allow_empty_path: 70 | raise ValueError( 71 | f"Invalid config and model path pairs ({(config_path, model_path)}):" 72 | f"When config_path is a regular URL, the model_path should not be empty" 73 | ) 74 | else: 75 | return config_path, model_path 76 | elif isinstance(config_path, LayoutModelConfig) and model_path is None: 77 | model_path = config_path.dual() 78 | else: 79 | model_path = self._parse_config(model_path, "weight") 80 | 81 | config_path = config_path if isinstance(config_path, str) else config_path.full 82 | model_path = model_path if isinstance(model_path, str) else model_path.full 83 | return config_path, model_path 84 | 85 | def __new__(cls, *args, **kwargs): 86 | 87 | requires_backends(cls, cls.DEPENDENCIES) 88 | return super().__new__(cls) -------------------------------------------------------------------------------- /src/layoutparser/models/detectron2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Layout Parser team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from . 
import catalog as _UNUSED 16 | # A trick learned from 17 | # https://github.com/facebookresearch/detectron2/blob/62cf3a2b6840734d2717abdf96e2dd57ed6612a6/detectron2/checkpoint/__init__.py#L6 18 | from .layoutmodel import Detectron2LayoutModel 19 | -------------------------------------------------------------------------------- /src/layoutparser/models/detectron2/catalog.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Layout Parser team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from iopath.common.file_io import PathHandler 16 | 17 | from ..base_catalog import PathManager 18 | 19 | MODEL_CATALOG = { 20 | "HJDataset": { 21 | "faster_rcnn_R_50_FPN_3x": "https://www.dropbox.com/s/6icw6at8m28a2ho/model_final.pth?dl=1", 22 | "mask_rcnn_R_50_FPN_3x": "https://www.dropbox.com/s/893paxpy5suvlx9/model_final.pth?dl=1", 23 | "retinanet_R_50_FPN_3x": "https://www.dropbox.com/s/yxsloxu3djt456i/model_final.pth?dl=1", 24 | }, 25 | "PubLayNet": { 26 | "faster_rcnn_R_50_FPN_3x": "https://www.dropbox.com/s/dgy9c10wykk4lq4/model_final.pth?dl=1", 27 | "mask_rcnn_R_50_FPN_3x": "https://www.dropbox.com/s/d9fc9tahfzyl6df/model_final.pth?dl=1", 28 | "mask_rcnn_X_101_32x8d_FPN_3x": "https://www.dropbox.com/s/57zjbwv6gh3srry/model_final.pth?dl=1", 29 | }, 30 | "PrimaLayout": { 31 | "mask_rcnn_R_50_FPN_3x": "https://www.dropbox.com/s/h7th27jfv19rxiy/model_final.pth?dl=1" 32 | }, 33 | "NewspaperNavigator": { 34 | "faster_rcnn_R_50_FPN_3x": "https://www.dropbox.com/s/6ewh6g8rqt2ev3a/model_final.pth?dl=1", 35 | }, 36 | "TableBank": { 37 | "faster_rcnn_R_50_FPN_3x": "https://www.dropbox.com/s/8v4uqmz1at9v72a/model_final.pth?dl=1", 38 | "faster_rcnn_R_101_FPN_3x": "https://www.dropbox.com/s/6vzfk8lk9xvyitg/model_final.pth?dl=1", 39 | }, 40 | "MFD": { 41 | "faster_rcnn_R_50_FPN_3x": "https://www.dropbox.com/s/7xel0i3iqpm2p8y/model_final.pth?dl=1", 42 | }, 43 | } 44 | 45 | CONFIG_CATALOG = { 46 | "HJDataset": { 47 | "faster_rcnn_R_50_FPN_3x": "https://www.dropbox.com/s/j4yseny2u0hn22r/config.yml?dl=1", 48 | "mask_rcnn_R_50_FPN_3x": "https://www.dropbox.com/s/4jmr3xanmxmjcf8/config.yml?dl=1", 49 | "retinanet_R_50_FPN_3x": "https://www.dropbox.com/s/z8a8ywozuyc5c2x/config.yml?dl=1", 50 | }, 51 | "PubLayNet": { 52 | "faster_rcnn_R_50_FPN_3x": "https://www.dropbox.com/s/f3b12qc4hc0yh4m/config.yml?dl=1", 53 | "mask_rcnn_R_50_FPN_3x": "https://www.dropbox.com/s/u9wbsfwz4y0ziki/config.yml?dl=1", 54 | "mask_rcnn_X_101_32x8d_FPN_3x": 
"https://www.dropbox.com/s/nau5ut6zgthunil/config.yaml?dl=1", 55 | }, 56 | "PrimaLayout": { 57 | "mask_rcnn_R_50_FPN_3x": "https://www.dropbox.com/s/yc92x97k50abynt/config.yaml?dl=1" 58 | }, 59 | "NewspaperNavigator": { 60 | "faster_rcnn_R_50_FPN_3x": "https://www.dropbox.com/s/wnido8pk4oubyzr/config.yml?dl=1", 61 | }, 62 | "TableBank": { 63 | "faster_rcnn_R_50_FPN_3x": "https://www.dropbox.com/s/7cqle02do7ah7k4/config.yaml?dl=1", 64 | "faster_rcnn_R_101_FPN_3x": "https://www.dropbox.com/s/h63n6nv51kfl923/config.yaml?dl=1", 65 | }, 66 | "MFD": { 67 | "faster_rcnn_R_50_FPN_3x": "https://www.dropbox.com/s/ld9izb95f19369w/config.yaml?dl=1", 68 | }, 69 | } 70 | 71 | # fmt: off 72 | LABEL_MAP_CATALOG = { 73 | "HJDataset": { 74 | 1: "Page Frame", 75 | 2: "Row", 76 | 3: "Title Region", 77 | 4: "Text Region", 78 | 5: "Title", 79 | 6: "Subtitle", 80 | 7: "Other", 81 | }, 82 | "PubLayNet": { 83 | 0: "Text", 84 | 1: "Title", 85 | 2: "List", 86 | 3: "Table", 87 | 4: "Figure"}, 88 | "PrimaLayout": { 89 | 1: "TextRegion", 90 | 2: "ImageRegion", 91 | 3: "TableRegion", 92 | 4: "MathsRegion", 93 | 5: "SeparatorRegion", 94 | 6: "OtherRegion", 95 | }, 96 | "NewspaperNavigator": { 97 | 0: "Photograph", 98 | 1: "Illustration", 99 | 2: "Map", 100 | 3: "Comics/Cartoon", 101 | 4: "Editorial Cartoon", 102 | 5: "Headline", 103 | 6: "Advertisement", 104 | }, 105 | "TableBank": { 106 | 0: "Table" 107 | }, 108 | "MFD": { 109 | 1: "Equation" 110 | }, 111 | } 112 | # fmt: on 113 | 114 | 115 | class LayoutParserDetectron2ModelHandler(PathHandler): 116 | """ 117 | Resolve anything that's in LayoutParser model zoo. 
118 | """ 119 | 120 | PREFIX = "lp://detectron2/" 121 | 122 | def _get_supported_prefixes(self): 123 | return [self.PREFIX] 124 | 125 | def _get_local_path(self, path, **kwargs): 126 | model_name = path[len(self.PREFIX) :] 127 | 128 | dataset_name, *model_name, data_type = model_name.split("/") 129 | 130 | if data_type == "weight": 131 | model_url = MODEL_CATALOG[dataset_name]["/".join(model_name)] 132 | elif data_type == "config": 133 | model_url = CONFIG_CATALOG[dataset_name]["/".join(model_name)] 134 | else: 135 | raise ValueError(f"Unknown data_type {data_type}") 136 | return PathManager.get_local_path(model_url, **kwargs) 137 | 138 | def _open(self, path, mode="r", **kwargs): 139 | return PathManager.open(self._get_local_path(path), mode, **kwargs) 140 | 141 | 142 | PathManager.register_handler(LayoutParserDetectron2ModelHandler()) 143 | -------------------------------------------------------------------------------- /src/layoutparser/models/detectron2/layoutmodel.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Layout Parser team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
from typing import Union
from PIL import Image
import numpy as np
import warnings

from .catalog import MODEL_CATALOG, PathManager, LABEL_MAP_CATALOG
from ..base_layoutmodel import BaseLayoutModel
from ...elements import Rectangle, TextBlock, Layout
from ...file_utils import is_torch_cuda_available, is_detectron2_available

if is_detectron2_available():
    import detectron2.engine
    import detectron2.config


__all__ = ["Detectron2LayoutModel"]


class Detectron2LayoutModel(BaseLayoutModel):
    """Create a Detectron2-based Layout Detection Model

    Args:
        config_path (:obj:`str`):
            The path to the configuration file.
        model_path (:obj:`str`, None):
            The path to the saved weights of the model.
            If set, overwrite the weights in the configuration file.
            Defaults to `None`.
        label_map (:obj:`dict`, optional):
            The map from the model prediction (ids) to real
            word labels (strings). If the config is from one of the supported
            datasets, Layout Parser will automatically initialize the label_map.
            Defaults to `None`.
        device (:obj:`str`, optional):
            Whether to use cuda or cpu devices. If not set, LayoutParser will
            automatically determine the device to initialize the models on.
        extra_config (:obj:`list`, optional):
            Extra configuration passed to the Detectron2 model
            configuration. The argument will be used in the `merge_from_list
            <https://detectron2.readthedocs.io/en/latest/modules/config.html#detectron2.config.CfgNode.merge_from_list>`_
            function. Defaults to `[]`.

    Examples::
        >>> import layoutparser as lp
        >>> model = lp.Detectron2LayoutModel('lp://HJDataset/faster_rcnn_R_50_FPN_3x/config')
        >>> model.detect(image)

    """

    DEPENDENCIES = ["detectron2"]
    DETECTOR_NAME = "detectron2"
    MODEL_CATALOG = MODEL_CATALOG

    def __init__(
        self,
        config_path,
        model_path=None,
        label_map=None,
        extra_config=None,
        enforce_cpu=None,
        device=None,
    ):
        # `enforce_cpu` is kept only for backward compatibility.
        if enforce_cpu is not None:
            warnings.warn(
                "Setting enforce_cpu is deprecated. Please set `device` instead.",
                DeprecationWarning,
            )

        if extra_config is None:
            extra_config = []

        config_path, model_path = self.config_parser(
            config_path, model_path, allow_empty_path=True
        )

        # BUGFIX: the label map must be resolved *before* the lp:// path is
        # converted to a local file path — afterwards the "lp://" prefix is
        # gone and the dataset name can no longer be recovered.
        if label_map is None:
            if config_path.startswith("lp://"):
                # BUGFIX: str.lstrip strips a *character set*, not a prefix,
                # so slice off the literal "lp://" instead. The normalized
                # full format is lp://<backend>/<dataset>/<arch>/<identifier>,
                # hence the dataset is the second segment.
                dataset_name = config_path[len("lp://") :].split("/")[1]
                label_map = LABEL_MAP_CATALOG[dataset_name]
            else:
                label_map = {}

        config_path = PathManager.get_local_path(config_path)

        cfg = detectron2.config.get_cfg()
        cfg.merge_from_file(config_path)
        cfg.merge_from_list(extra_config)

        if model_path is not None:
            model_path = PathManager.get_local_path(model_path)
            # Because it will be forwarded to the detectron2 paths
            cfg.MODEL.WEIGHTS = model_path

        # BUGFIX: only auto-select the device when the caller did not specify
        # one. The original forced "cpu" whenever CUDA was unavailable, which
        # silently overrode an explicitly requested device.
        if device is None:
            device = "cuda" if is_torch_cuda_available() else "cpu"
        cfg.MODEL.DEVICE = device

        self.cfg = cfg

        self.label_map = label_map
        self._create_model()

    def _create_model(self):
        # DefaultPredictor bundles model construction, checkpoint loading,
        # and per-image preprocessing.
        self.model = detectron2.engine.DefaultPredictor(self.cfg)

    def gather_output(self, outputs):
        """Convert raw detectron2 instance predictions into a `Layout`."""

        instance_pred = outputs["instances"].to("cpu")

        layout = Layout()
        scores = instance_pred.scores.tolist()
        boxes = instance_pred.pred_boxes.tensor.tolist()
        labels = instance_pred.pred_classes.tolist()

        for score, box, label in zip(scores, boxes, labels):
            x_1, y_1, x_2, y_2 = box

            # Map the numeric class id to a human-readable label when known;
            # fall back to the raw id otherwise.
            label = self.label_map.get(label, label)

            cur_block = TextBlock(
                Rectangle(x_1, y_1, x_2, y_2), type=label, score=score
            )
            layout.append(cur_block)

        return layout

    def detect(self, image):
        """Detect the layout of a given image.

        Args:
            image (:obj:`np.ndarray` or `PIL.Image`): The input image to detect.

        Returns:
            :obj:`~layoutparser.Layout`: The detected layout of the input image
        """

        image = self.image_loader(image)
        outputs = self.model(image)
        layout = self.gather_output(outputs)
        return layout

    def image_loader(self, image: Union["np.ndarray", "Image.Image"]):
        # Convert PIL Image input to an RGB numpy array; numpy arrays are
        # passed through unchanged.
        # NOTE(review): detectron2's DefaultPredictor expects the channel
        # order given by cfg.INPUT.FORMAT (BGR by default) — confirm that the
        # configs used here set it consistently with this RGB conversion.
        if isinstance(image, Image.Image):
            if image.mode != "RGB":
                image = image.convert("RGB")
            image = np.array(image)

        return image

# --------------------------------------------------------------------------
# /src/layoutparser/models/effdet/__init__.py:
# --------------------------------------------------------------------------
# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# (statement continues on the next line of this chunk: "from . import catalog as _UNUSED")
# (statement began on the previous line of this chunk: "from .")
from . import catalog as _UNUSED  # imported for its handler-registration side effect
from .layoutmodel import EfficientDetLayoutModel

# --------------------------------------------------------------------------
# /src/layoutparser/models/effdet/catalog.py:
# --------------------------------------------------------------------------
# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from iopath.common.file_io import PathHandler

from ..base_catalog import PathManager

# Dropbox-hosted weights for the officially supported EfficientDet models,
# keyed by dataset then by model architecture.
MODEL_CATALOG = {
    "PubLayNet": {
        "tf_efficientdet_d0": "https://www.dropbox.com/s/ukbw5s673633hsw/publaynet-tf_efficientdet_d0.pth.tar?dl=1",
        "tf_efficientdet_d1": "https://www.dropbox.com/s/gxy11xkkiwnpgog/publaynet-tf_efficientdet_d1.pth.tar?dl=1"
    },
    "MFD": {
        "tf_efficientdet_d0": "https://www.dropbox.com/s/dkr22iux7thlhel/mfd-tf_efficientdet_d0.pth.tar?dl=1",
        "tf_efficientdet_d1": "https://www.dropbox.com/s/icmbiaqr5s9bz1x/mfd-tf_efficientdet_d1.pth.tar?dl=1"
    }
}

# In effdet training scripts, it requires the label_map starting
# from 1 instead of 0
LABEL_MAP_CATALOG = {
    "PubLayNet": {
        1: "Text",
        2: "Title",
        3: "List",
        4: "Table",
        5: "Figure"
    },
    "MFD": {
        1: "Equation",
    }
}

class LayoutParserEfficientDetModelHandler(PathHandler):
    """
    Resolve anything that's in LayoutParser model zoo.
    """

    PREFIX = "lp://efficientdet/"

    def _get_supported_prefixes(self):
        # iopath hook: this handler is consulted for any path with this prefix.
        return [self.PREFIX]

    def _get_local_path(self, path, **kwargs):
        # Maps "lp://efficientdet/<dataset>/<model...>/<data_type>" to a local
        # file path, downloading/caching the underlying URL if necessary.
        model_name = path[len(self.PREFIX) :]

        # `model_name` is re-bound to the middle segments only — the model
        # architecture may itself contain "/".
        dataset_name, *model_name, data_type = model_name.split("/")

        if data_type == "weight":
            model_url = MODEL_CATALOG[dataset_name]["/".join(model_name)]
        else:
            # Unlike detectron2, effdet models have no separate config files,
            # so only "weight" is resolvable here.
            raise ValueError(f"Unknown data_type {data_type}")
        return PathManager.get_local_path(model_url, **kwargs)

    def _open(self, path, mode="r", **kwargs):
        return PathManager.open(self._get_local_path(path), mode, **kwargs)


PathManager.register_handler(LayoutParserEfficientDetModelHandler())

# --------------------------------------------------------------------------
# /src/layoutparser/models/effdet/layoutmodel.py:
# --------------------------------------------------------------------------
# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional, Union, Dict, Any, Tuple

from PIL import Image
import numpy as np

from .catalog import PathManager, LABEL_MAP_CATALOG, MODEL_CATALOG
from ..base_layoutmodel import BaseLayoutModel
from ...elements import Rectangle, TextBlock, Layout

from ...file_utils import is_effdet_available, is_torch_cuda_available

if is_effdet_available():
    import torch
    from effdet import create_model
    from effdet.data.transforms import (
        IMAGENET_DEFAULT_MEAN,
        IMAGENET_DEFAULT_STD,
        transforms_coco_eval,
    )
else:
    # Copied from https://github.com/rwightman/efficientdet-pytorch/blob/c5b694aa34900fdee6653210d856ca8320bf7d4e/effdet/data/transforms.py#L13
    # Such that when effdet is not loaded, we'll still have default values for
    # IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
    IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
    IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
    # IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5)
    # IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5)


class InputTransform:
    """Resize and normalize a PIL image into the tensor format effdet expects."""

    def __init__(
        self,
        image_size,
        mean=IMAGENET_DEFAULT_MEAN,
        std=IMAGENET_DEFAULT_STD,
    ):

        self.mean = mean
        self.std = std

        self.transform = transforms_coco_eval(
            image_size,
            interpolation="bilinear",
            use_prefetcher=True,
            fill_color="mean",
            mean=self.mean,
            std=self.std,
        )

        # Pre-scaled to the 0-255 pixel range so they can be subtracted /
        # divided from the raw uint8 tensor directly.
        self.mean_tensor = torch.tensor([x * 255 for x in mean]).view(1, 3, 1, 1)
        self.std_tensor = torch.tensor([x * 255 for x in std]).view(1, 3, 1, 1)

    def preprocess(self, image: "Image.Image") -> Tuple["torch.Tensor", Dict]:
        """Return a normalized batched image tensor plus per-image metadata."""

        image = image.convert("RGB")
        image_info = {"img_size": image.size}

        input, image_info = self.transform(image, image_info)
        # Add a batch dimension of 1 to the tensor and every metadata entry.
        image_info = {
            key: torch.tensor(val).unsqueeze(0) for key, val in image_info.items()
        }

        input = torch.tensor(input).unsqueeze(0)
        input = input.float().sub_(self.mean_tensor).div_(self.std_tensor)

        return input, image_info


class EfficientDetLayoutModel(BaseLayoutModel):
    """Create a EfficientDet-based Layout Detection Model

    Args:
        config_path (:obj:`str`):
            The path to the configuration file.
        model_path (:obj:`str`, None):
            The path to the saved weights of the model.
            If set, overwrite the weights in the configuration file.
            Defaults to `None`.
        label_map (:obj:`dict`, optional):
            The map from the model prediction (ids) to real
            word labels (strings). If the config is from one of the supported
            datasets, Layout Parser will automatically initialize the label_map.
            Defaults to `None`.
        enforce_cpu (:obj:`bool`, optional):
            Deprecated; use `device` instead.
        extra_config (:obj:`dict`, optional):
            Extra configuration passed to the EfficientDet model
            configuration. Currently supported arguments:
                num_classes: specifying the number of classes for the models
                output_confidence_threshold: minimum object prediction
                    confidence to retain

    Examples::
        >>> import layoutparser as lp
        >>> model = lp.EfficientDetLayoutModel("lp://PubLayNet/tf_efficientdet_d0/config")
        >>> model.detect(image)

    """

    DEPENDENCIES = ["effdet"]
    DETECTOR_NAME = "efficientdet"
    MODEL_CATALOG = MODEL_CATALOG

    DEFAULT_OUTPUT_CONFIDENCE_THRESHOLD = 0.25

    def __init__(
        self,
        config_path: str,
        model_path: str = None,
        label_map: Optional[Dict] = None,
        extra_config: Optional[Dict] = None,
        enforce_cpu: bool = False,
        device: str = None,
    ):
        # CONSISTENCY: mirror Detectron2LayoutModel — `enforce_cpu` is
        # accepted but deprecated (it was silently ignored before).
        if enforce_cpu:
            import warnings  # local import: only needed on this deprecated path

            warnings.warn(
                "Setting enforce_cpu is deprecated. Please set `device` instead.",
                DeprecationWarning,
            )

        # BUGFIX: only auto-select the device when the caller did not specify
        # one. The original forced "cpu" whenever CUDA was unavailable, which
        # silently overrode an explicitly requested device.
        if device is None:
            device = "cuda" if is_torch_cuda_available() else "cpu"
        self.device = device

        extra_config = extra_config if extra_config is not None else {}

        self._initialize_model(config_path, model_path, label_map, extra_config)

        self.output_confidence_threshold = extra_config.get(
            "output_confidence_threshold", self.DEFAULT_OUTPUT_CONFIDENCE_THRESHOLD
        )

        self.preprocessor = InputTransform(self.config.image_size)

    def _initialize_model(
        self,
        config_path: str,
        model_path: Optional[str],
        label_map: Optional[Dict],
        extra_config: Optional[Dict],
    ):
        """Resolve the model name/weights/label map and build `self.model`."""

        config_path, model_path = self.config_parser(config_path, model_path)

        if config_path.startswith("lp://"):
            # If it's officially supported by layoutparser.
            # BUGFIX: str.lstrip strips a *character set*, not a prefix —
            # e.g. lstrip("lp://") would also eat a leading "p" of the next
            # segment. Slice the literal prefix off instead; the normalized
            # full format is lp://<backend>/<dataset>/<arch>/<identifier>.
            dataset_name, model_name = config_path[len("lp://") :].split("/")[1:3]

            if label_map is None:
                label_map = LABEL_MAP_CATALOG[dataset_name]
            num_classes = len(label_map)

            model_path = PathManager.get_local_path(model_path)
        else:
            assert (
                model_path is not None
            ), f"When the specified model is not layoutparser-based, you need to specify the model_path"

            assert (
                label_map is not None or "num_classes" in extra_config
            ), "When the specified model is not layoutparser-based, you need to specify the label_map or add num_classes in the extra_config"

            model_name = config_path
            model_path = PathManager.get_local_path(
                model_path
            )  # It might be an https URL

            num_classes = len(label_map) if label_map else extra_config["num_classes"]

        # Deduplicated: both branches previously issued this identical call.
        self.model = create_model(
            model_name,
            num_classes=num_classes,
            bench_task="predict",
            pretrained=True,
            checkpoint_path=model_path,
        )

        self.model.to(self.device)
        self.model.eval()
        self.config = self.model.config
        self.label_map = label_map if label_map is not None else {}

    def detect(self, image: Union["np.ndarray", "Image.Image"]):
        """Detect the layout of a given image.

        Args:
            image (:obj:`np.ndarray` or `PIL.Image`): The input image to detect.

        Returns:
            :obj:`~layoutparser.Layout`: The detected layout of the input image
        """

        image = self.image_loader(image)

        model_inputs, image_info = self.preprocessor.preprocess(image)

        model_outputs = self.model(
            model_inputs.to(self.device),
            {key: val.to(self.device) for key, val in image_info.items()},
        )

        layout = self.gather_output(model_outputs)
        return layout

    def gather_output(self, model_outputs: "torch.Tensor") -> Layout:
        """Convert raw effdet detections into a `Layout` of TextBlocks."""

        model_outputs = model_outputs.cpu().detach()
        box_predictions = Layout()

        for index, sample in enumerate(model_outputs):
            # Convert (x1, y1, x2, y2) columns to (x, y, w, h) in place.
            sample[:, 2] -= sample[:, 0]
            sample[:, 3] -= sample[:, 1]

            for det in sample:

                score = float(det[4])
                pred_cat = int(det[5])
                x, y, w, h = det[0:4].tolist()

                if (
                    score < self.output_confidence_threshold
                ):  # stop when below this threshold, scores in descending order
                    break

                box_predictions.append(
                    TextBlock(
                        block=Rectangle(x, y, w + x, h + y),
                        score=score,
                        id=index,
                        type=self.label_map.get(pred_cat, pred_cat),
                    )
                )

        return box_predictions

    def image_loader(self, image: Union["np.ndarray", "Image.Image"]):
        # Convert cv2 Image Input
        if isinstance(image, np.ndarray):
            # In this case, we assume the image is loaded by cv2
            # and the channel order is BGR, so flip it to RGB first.
            image = image[..., ::-1]
            image = Image.fromarray(image, mode="RGB")

        return image

# --------------------------------------------------------------------------
# /src/layoutparser/models/model_config.py:
# --------------------------------------------------------------------------
# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Module docstring (opens here in the original; continues in the next chunk):
# Inside layoutparser, we support the following formats for specifying layout
# model configs or weights:
#
# 1. URL-based formats:
#     - A local path: ~/models/publaynet/path
#     - Link to the models: https://web/url/to/models
#
# 2.
# (module docstring continues; placeholders were mangled in this dump and are
# reconstructed here from the parser logic below)
# 2. LayoutParser Based Model/Config Path Formats:
#     - Full format:  lp://<backend_name>/<dataset_name>/<model_arch>
#     - Short format: lp://<dataset_name>/<model_arch>
#     - Brief format: lp://<dataset_name>
#
# For each LayoutParser-based format, you could also add a `config` or `weight`
# identifier after them:
#     - Full format:  lp://<backend_name>/<dataset_name>/<model_arch>/<identifier>
#     - Short format: lp://<dataset_name>/<model_arch>/<identifier>
#     - Brief format: lp://<dataset_name>/<identifier>

from dataclasses import dataclass

LAYOUT_PARSER_MODEL_PREFIX = "lp://"
ALLOWED_LAYOUT_MODEL_IDENTIFIER_NAMES = ["config", "weight"]


@dataclass
class LayoutModelConfig:
    """Structured representation of a full lp:// model/config path."""

    backend_name: str
    dataset_name: str
    model_arch: str
    identifier: str  # either "config" or "weight"

    def __post_init__(self):
        assert self.identifier in ALLOWED_LAYOUT_MODEL_IDENTIFIER_NAMES

    @property
    def full(self):
        """lp://<backend_name>/<dataset_name>/<model_arch>/<identifier>"""
        return LAYOUT_PARSER_MODEL_PREFIX + "/".join(
            [self.backend_name, self.dataset_name, self.model_arch, self.identifier]
        )

    @property
    def short(self):
        """lp://<dataset_name>/<model_arch>/<identifier>"""
        return LAYOUT_PARSER_MODEL_PREFIX + "/".join(
            [self.dataset_name, self.model_arch, self.identifier]
        )

    @property
    def brief(self):
        """lp://<dataset_name>/<model_arch>"""
        return LAYOUT_PARSER_MODEL_PREFIX + "/".join(
            [self.dataset_name, self.model_arch]
        )

    def dual(self):
        """Return the counterpart config: weight <-> config."""
        for identifier in ALLOWED_LAYOUT_MODEL_IDENTIFIER_NAMES:
            if identifier != self.identifier:
                break

        return self.__class__(
            backend_name=self.backend_name,
            dataset_name=self.dataset_name,
            model_arch=self.model_arch,
            identifier=identifier,
        )


def is_lp_layout_model_config_any_format(config: str) -> bool:
    """Check whether `config` looks like any lp:// format (full/short/brief,
    with or without an identifier)."""
    if not config.startswith(LAYOUT_PARSER_MODEL_PREFIX):
        return False
    if len(config[len(LAYOUT_PARSER_MODEL_PREFIX) :].split("/")) not in [1, 2, 3, 4]:
        return False
    return True


def add_identifier_for_config(config: str, identifier: str) -> str:
    """Ensure `config` ends with `/<identifier>`, normalizing any trailing "/".

    BUGFIX: the original used ``config.rstrip(f"/{identifier}")`` — but
    str.rstrip removes a *character set*, not a suffix, so e.g.
    "lp://PubLayNet/faster_rcnn" lost its trailing "nn" ('n' and 'c' are in
    the set of "/config"). Use an explicit suffix check instead.
    """
    config = config.rstrip("/")
    suffix = f"/{identifier}"
    if config.endswith(suffix):
        config = config[: -len(suffix)]
    return config + suffix


def layout_model_config_parser(
    config, backend_name=None, model_arch=None
) -> LayoutModelConfig:
    """Parse an lp:// path (with identifier) into a `LayoutModelConfig`.

    `backend_name`/`model_arch` supply the parts missing from the short and
    brief formats. Raises ValueError for unparseable inputs.
    """

    assert config.split("/")[-1] in ALLOWED_LAYOUT_MODEL_IDENTIFIER_NAMES, (
        f"The input config {config} does not contain identifier information."
        f"Consider run `config = add_identifier_for_config(config, identifier)` first."
    )

    parts = config[len(LAYOUT_PARSER_MODEL_PREFIX) :].split("/")
    if len(parts) == 4:  # Full format
        backend_name, dataset_name, model_arch, identifier = parts
    elif len(parts) == 3:  # Short format
        assert backend_name is not None

        if parts[0] == backend_name:
            # lp://<backend_name>/<dataset_name>/<identifier>
            assert model_arch is not None
            _, dataset_name, identifier = parts
        else:
            # lp://<dataset_name>/<model_arch>/<identifier>
            dataset_name, model_arch, identifier = parts

    elif len(parts) == 2:  # brief format
        assert backend_name is not None
        assert model_arch is not None
        if parts[0] == backend_name:
            # lp://<backend_name>/<identifier> — dataset is unrecoverable
            raise ValueError(f"Invalid LP Model Config {config}")

        # lp://<dataset_name>/<identifier>
        dataset_name, identifier = parts
    else:
        raise ValueError(f"Invalid LP Model Config {config}")

    return LayoutModelConfig(
        backend_name=backend_name,
        dataset_name=dataset_name,
        model_arch=model_arch,
        identifier=identifier,
    )

# --------------------------------------------------------------------------
# /src/layoutparser/models/paddledetection/__init__.py:
# --------------------------------------------------------------------------
# Copyright 2021 The Layout Parser team and Paddle Detection model
# contributors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from . import catalog as _UNUSED 17 | from .layoutmodel import PaddleDetectionLayoutModel 18 | -------------------------------------------------------------------------------- /src/layoutparser/models/paddledetection/catalog.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Layout Parser team and Paddle Detection model 2 | # contributors. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
import os
import logging
from typing import Any, Optional
from urllib.parse import urlparse
import tarfile
import uuid

from iopath.common.file_io import PathHandler
from iopath.common.file_io import HTTPURLHandler
from iopath.common.file_io import get_cache_dir, file_lock
from iopath.common.download import download

from ..base_catalog import PathManager

MODEL_CATALOG = {
    "PubLayNet": {
        "ppyolov2_r50vd_dcn_365e": "https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar",
    },
    "TableBank": {
        "ppyolov2_r50vd_dcn_365e": "https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar",
        # "ppyolov2_r50vd_dcn_365e_tableBank_latex": "https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar",
        # TODO: Train a single tablebank model for paddlepaddle
    },
}

# fmt: off
LABEL_MAP_CATALOG = {
    "PubLayNet": {
        0: "Text",
        1: "Title",
        2: "List",
        3: "Table",
        4: "Figure"},
    "TableBank": {
        0: "Table"
    },
}
# fmt: on


# Paddle model package everything in tar files, and each model's tar file should contain
# the following files in the list:
_TAR_FILE_NAME_LIST = [
    "inference.pdiparams",
    "inference.pdiparams.info",
    "inference.pdmodel",
]


def _get_untar_directory(tar_file: str) -> str:
    """Return the directory the tar file is (or will be) extracted into:
    a sibling folder named after the tar file without its extension."""

    base_path = os.path.dirname(tar_file)
    file_name = os.path.splitext(os.path.basename(tar_file))[0]
    target_folder = os.path.join(base_path, file_name)

    return target_folder


def _untar_model_weights(model_tar):
    """untar model files"""

    model_dir = _get_untar_directory(model_tar)

    # Only re-extract when the params or model file is missing.
    # NOTE(review): the .info file (_TAR_FILE_NAME_LIST[1]) is deliberately
    # not checked here — presumably optional; confirm before tightening.
    if not os.path.exists(
        os.path.join(model_dir, _TAR_FILE_NAME_LIST[0])
    ) or not os.path.exists(os.path.join(model_dir, _TAR_FILE_NAME_LIST[2])):
        # the path to save the decompressed file
        os.makedirs(model_dir, exist_ok=True)
        with tarfile.open(model_tar, "r") as tarobj:
            for member in tarobj.getmembers():
                # Extract only whitelisted file names, flattening any
                # directory structure inside the archive.
                filename = None
                for tar_file_name in _TAR_FILE_NAME_LIST:
                    if tar_file_name in member.name:
                        filename = tar_file_name
                if filename is None:
                    continue
                file = tarobj.extractfile(member)
                with open(os.path.join(model_dir, filename), "wb") as model_file:
                    model_file.write(file.read())
    return model_dir


def is_cached_folder_exists_and_valid(cached):
    """Check that the extraction folder for `cached` exists and contains
    every file required by _TAR_FILE_NAME_LIST."""
    possible_extracted_model_folder = _get_untar_directory(cached)
    if not os.path.exists(possible_extracted_model_folder):
        return False
    for tar_file in _TAR_FILE_NAME_LIST:
        if not os.path.exists(os.path.join(possible_extracted_model_folder, tar_file)):
            return False
    return True


class PaddleModelURLHandler(HTTPURLHandler):
    """
    Supports download and file check for Baidu Cloud links
    """

    MAX_FILENAME_LEN = 250

    def _get_supported_prefixes(self):
        return ["https://paddle-model-ecology.bj.bcebos.com"]

    def _isfile(self, path):
        return path in self.cache_map

    def _get_local_path(
        self,
        path: str,
        force: bool = False,
        cache_dir: Optional[str] = None,
        **kwargs: Any,
    ) -> str:
        """
        As paddle model stores all files in tar files, we need to extract them
        and get the newly extracted folder path. This function rewrites the base
        function to support the following situations:

        1. If the tar file is not downloaded, it will download the tar file,
           extract it to the target folder, delete the downloaded tar file,
           and return the folder path.
        2. If the extracted target folder is present, and all the necessary model
           files are present (specified in _TAR_FILE_NAME_LIST), it will
           return the folder path.
        3. If the tar file is downloaded, but the extracted target folder is not
           present (or it doesn't contain the necessary files in _TAR_FILE_NAME_LIST),
           it will extract the tar file to the target folder, delete the tar file,
           and return the folder path.

        """
        self._check_kwargs(kwargs)
        if (
            force
            or path not in self.cache_map
            or not os.path.exists(self.cache_map[path])
        ):
            logger = logging.getLogger(__name__)
            parsed_url = urlparse(path)
            dirname = os.path.join(
                get_cache_dir(cache_dir), os.path.dirname(parsed_url.path.lstrip("/"))
            )
            filename = path.split("/")[-1]
            # Keep the cache filename within filesystem limits while staying
            # unique.
            if len(filename) > self.MAX_FILENAME_LEN:
                filename = filename[:100] + "_" + uuid.uuid4().hex

            cached = os.path.join(dirname, filename)

            if is_cached_folder_exists_and_valid(cached):
                # When the cached folder exists and valid, we don't need to
                # redownload the tar file.
                self.cache_map[path] = _get_untar_directory(cached)

            else:
                with file_lock(cached):
                    if not os.path.isfile(cached):
                        logger.info("Downloading {} ...".format(path))
                        cached = download(path, dirname, filename=filename)

                if path.endswith(".tar"):
                    model_dir = _untar_model_weights(cached)
                    try:
                        os.remove(cached)  # remove the redundant tar file
                        # TODO: remove the .lock file .
                    # BUGFIX: was a bare `except:`, which also swallows
                    # SystemExit/KeyboardInterrupt; only filesystem errors
                    # are expected (and tolerable) here.
                    except OSError:
                        logger.warning(
                            f"Not able to remove the cached tar file {cached}"
                        )

                logger.info("URL {} cached in {}".format(path, model_dir))
                self.cache_map[path] = model_dir

        return self.cache_map[path]


class LayoutParserPaddleModelHandler(PathHandler):
    """
    Resolve anything that's in LayoutParser model zoo.
    """

    PREFIX = "lp://paddledetection/"

    def _get_supported_prefixes(self):
        return [self.PREFIX]

    def _get_local_path(self, path, **kwargs):
        # Maps "lp://paddledetection/<dataset>/<model...>/<data_type>" to a
        # local folder path (paddle models are folders, not single files).
        model_name = path[len(self.PREFIX) :]
        # `model_name` is re-bound to the middle segments only — the model
        # architecture may itself contain "/".
        dataset_name, *model_name, data_type = model_name.split("/")

        if data_type == "weight":
            model_url = MODEL_CATALOG[dataset_name]["/".join(model_name)]
        else:
            raise ValueError(f"Unknown data_type {data_type}")
        return PathManager.get_local_path(model_url, **kwargs)

    def _open(self, path, mode="r", **kwargs):
        return PathManager.open(self._get_local_path(path), mode, **kwargs)


PathManager.register_handler(PaddleModelURLHandler())
PathManager.register_handler(LayoutParserPaddleModelHandler())

# --------------------------------------------------------------------------
# /src/layoutparser/ocr/__init__.py:
# --------------------------------------------------------------------------
# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .gcv_agent import GCVAgent, GCVFeatureType
from .tesseract_agent import TesseractAgent, TesseractFeatureType

# --------------------------------------------------------------------------
# /src/layoutparser/ocr/base.py:
# --------------------------------------------------------------------------
# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod
from enum import IntEnum

from ..file_utils import requires_backends

class BaseOCRElementType(IntEnum):
    """Base enum for per-backend OCR granularity levels (page/block/...)."""

    @property
    @abstractmethod
    def attr_name(self):
        # Name of the field in the backend's response that carries this
        # element type's index (see TesseractFeatureType.attr_name).
        pass


class BaseOCRAgent(ABC):
    """Abstract base for OCR backends (e.g. Tesseract, Google Cloud Vision)."""

    @property
    @abstractmethod
    def DEPENDENCIES(self):
        """DEPENDENCIES lists all necessary dependencies for the class."""
        pass

    def __new__(cls, *args, **kwargs):
        # Fail fast at construction time if the backend package declared in
        # DEPENDENCIES is not installed.
        requires_backends(cls, cls.DEPENDENCIES)
        return super().__new__(cls)

    @abstractmethod
    def detect(self, image):
        pass

# --------------------------------------------------------------------------
# /src/layoutparser/ocr/tesseract_agent.py:
# --------------------------------------------------------------------------
# Copyright 2021 The Layout Parser team. All rights reserved.
class TesseractFeatureType(BaseOCRElementType):
    """
    The element types for Tesseract Detection API
    """

    PAGE = 0
    BLOCK = 1
    PARA = 2
    LINE = 3
    WORD = 4

    @property
    def attr_name(self):
        """Column name used for this level in pytesseract's image_to_data output."""
        name_cvt = {
            TesseractFeatureType.PAGE: "page_num",
            TesseractFeatureType.BLOCK: "block_num",
            TesseractFeatureType.PARA: "par_num",
            TesseractFeatureType.LINE: "line_num",
            TesseractFeatureType.WORD: "word_num",
        }
        return name_cvt[self]

    @property
    def group_levels(self):
        """All column names from the page level down to (and including) this
        level; used as the groupby keys when aggregating the OCR output."""
        levels = ["page_num", "block_num", "par_num", "line_num", "word_num"]
        return levels[: self + 1]


class TesseractAgent(BaseOCRAgent):
    """
    A wrapper for `Tesseract `_ Text
    Detection APIs based on `PyTesseract `_.
    """

    DEPENDENCIES = ["pytesseract"]

    def __init__(self, languages="eng", **kwargs):
        """Create a Tesseract OCR Agent.

        Args:
            languages (:obj:`list` or :obj:`str`, optional):
                You can specify the language code(s) of the documents to detect to improve
                accuracy. The supported language and their code can be found on
                `its github repo `_.
                It supports two formats: 1) you can pass in the languages code as a string
                of format like `"eng+fra"`, or 2) you can pack them as a list of strings
                `["eng", "fra"]`.
                Defaults to 'eng'.
        """
        self.lang = languages if isinstance(languages, str) else "+".join(languages)
        self.configs = kwargs

    @classmethod
    def with_tesseract_executable(cls, tesseract_cmd_path, **kwargs):
        """Create an agent that uses the Tesseract binary at *tesseract_cmd_path*.

        Note: this sets the executable path globally on pytesseract, so it
        affects every agent in the process, not only the returned instance.
        """
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd_path
        return cls(**kwargs)

    def _detect(self, img_content):
        """Run Tesseract twice: once for the plain text and once for the
        per-token data table (bounding boxes, confidences, hierarchy)."""
        res = {}
        res["text"] = pytesseract.image_to_string(
            img_content, lang=self.lang, **self.configs
        )
        _data = pytesseract.image_to_data(img_content, lang=self.lang, **self.configs)
        # image_to_data returns a TSV string; QUOTE_NONE because Tesseract does
        # not quote fields, and the converter keeps numeric-looking tokens as text.
        res["data"] = pd.read_csv(
            io.StringIO(_data),
            quoting=csv.QUOTE_NONE,
            encoding="utf-8",
            sep="\t",
            converters={"text": str},
        )
        return res

    def detect(
        self, image, return_response=False, return_only_text=True, agg_output_level=None
    ):
        """Send the input image for OCR.

        Args:
            image (:obj:`np.ndarray` or :obj:`str`):
                The input image array or the name of the image file
            return_response (:obj:`bool`, optional):
                Whether directly return all output (string and boxes
                info) from Tesseract.
                Defaults to `False`.
            return_only_text (:obj:`bool`, optional):
                Whether return only the texts in the OCR results.
                Defaults to `True`. Note that this takes precedence over
                `agg_output_level`: it must be set to `False` for the
                aggregation to be applied.
            agg_output_level (:obj:`~TesseractFeatureType`, optional):
                When set (and `return_only_text=False`), aggregate the OCR
                output with respect to the specified aggregation level.
                Defaults to `None`.
        """

        res = self._detect(image)

        # Precedence: raw response > text only > aggregated layout > text.
        if return_response:
            return res

        if return_only_text:
            return res["text"]

        if agg_output_level is not None:
            return self.gather_data(res, agg_output_level)

        return res["text"]

    @staticmethod
    def gather_data(response, agg_level):
        """
        Gather the OCR'ed text, bounding boxes, and confidence
        in a given aggregation level.
        """
        assert isinstance(
            agg_level, TesseractFeatureType
        ), f"Invalid agg_level {agg_level}"
        res = response["data"]
        # NOTE(review): x_2/y_2 are derived from max(width)/max(height) of the
        # group members, not from the rightmost/bottommost extent; this matches
        # the upstream behavior — confirm before relying on exact box bounds.
        df = (
            res[~res.text.isna()]
            .groupby(agg_level.group_levels)
            .apply(
                lambda gp: pd.Series(
                    [
                        gp["left"].min(),
                        gp["top"].min(),
                        gp["width"].max(),
                        gp["height"].max(),
                        gp["conf"].mean(),
                        gp["text"].str.cat(sep=" "),
                    ]
                )
            )
            .reset_index(drop=True)
            .reset_index()
            .rename(
                columns={
                    0: "x_1",
                    1: "y_1",
                    2: "w",
                    3: "h",
                    4: "score",
                    5: "text",
                    "index": "id",
                }
            )
            .assign(
                x_2=lambda x: x.x_1 + x.w,
                y_2=lambda x: x.y_1 + x.h,
                block_type="rectangle",
            )
            .drop(columns=["w", "h"])
        )

        return load_dataframe(df)

    @staticmethod
    def load_response(filename):
        """Load a previously saved OCR response from a pickle file.

        Warning: pickle deserialization executes arbitrary code — only load
        files produced by :meth:`save_response` from a trusted source.
        """
        with open(filename, "rb") as fp:
            res = pickle.load(fp)
        return res

    @staticmethod
    def save_response(res, file_name):
        """Persist an OCR response dict to *file_name* via pickle."""
        with open(file_name, "wb") as fp:
            pickle.dump(res, fp, protocol=pickle.HIGHEST_PROTOCOL)
def generalized_connected_component_analysis_1d(
    sequence: List[Any],
    scoring_func: Callable[[Any, Any], int],
    aggregation_func: Callable[[List[Any]], Any] = None,
    default_score_value: int = 0,
) -> List[Any]:
    """Perform connected component analysis for any 1D sequence based on
    the scoring function and the aggregation function.
    It will generate the adjacency_matrix for the 1D sequence object using
    the provided `scoring_func` and find the connected components.
    The `aggregation_func` will be used to aggregate all elements within
    identified components (when not set, it will be the identity function).

    Args:
        sequence (List[Any]):
            The provided 1D sequence of objects.
        scoring_func (Callable[[Any, Any], int]):
            The scoring function used to construct the adjacency_matrix.
            It should take two objects in the sequence and produce an integer.
        aggregation_func (Callable[[List[Any]], Any], optional):
            The function used to aggregate the elements within an identified
            component.
            Defaults to the identity function: `lambda x: x`.
        default_score_value (int, optional):
            Used to set the default (background) score values that should be
            not considered when running connected component analysis.
            Defaults to 0.

    Returns:
        List[Any]: A list of length n - the number of the detected components.
    """

    if aggregation_func is None:
        aggregation_func = lambda x: x  # Identity Function

    seq_len = len(sequence)
    adjacency_matrix = np.ones((seq_len, seq_len)) * default_score_value

    # Only the upper triangle is filled; connected_components treats the
    # graph as undirected, so the symmetric half is unnecessary.
    for i in range(seq_len):
        for j in range(i + 1, seq_len):
            adjacency_matrix[i][j] = scoring_func(sequence[i], sequence[j])

    graph = csr_matrix(adjacency_matrix)
    n_components, labels = connected_components(
        csgraph=graph, directed=False, return_labels=True
    )

    grouped_sequence = []
    for comp_idx in range(n_components):
        element_idx = np.where(labels == comp_idx)[0]
        grouped_sequence.append(aggregation_func([sequence[i] for i in element_idx]))

    return grouped_sequence


def simple_line_detection(
    layout: Iterable[BaseLayoutElement], x_tolerance: int = 10, y_tolerance: int = 10
) -> List[BaseLayoutElement]:
    """Perform line detection based on connected component analysis.

    The is_line_wise_close is the scoring function, which returns True
    if the y-difference is smaller than the y_tolerance AND the
    x-difference (the horizontal gap between two boxes) is also smaller
    than the x_tolerance, and False otherwise.

    All the detected components will then be passed into aggregation_func,
    which returns the overall union box of all the elements, or the line
    box.

    Args:
        layout (Iterable):
            A list (or Layout) of BaseLayoutElement
        x_tolerance (int, optional):
            The value used for specifying the maximum allowed horizontal gap
            when considered whether two tokens are from the same line.
            Defaults to 10.
        y_tolerance (int, optional):
            The value used for specifying the maximum allowed y-difference
            when considered whether two tokens are from the same line.
            Defaults to 10.

    Returns:
        List[BaseLayoutElement]: A list of BaseLayoutElement, denoting the line boxes.
    """

    def is_line_wise_close(token_a, token_b, x_tolerance, y_tolerance):
        y_a = token_a.block.center[1]
        y_b = token_b.block.center[1]

        a_left, a_right = token_a.block.coordinates[0::2]
        b_left, b_right = token_b.block.coordinates[0::2]

        return (
            abs(y_a - y_b) <= y_tolerance
            and min(abs(a_left - b_right), abs(a_right - b_left)) <= x_tolerance
        )
        # If the y-difference is smaller than the y_tolerance AND
        # the x-difference (the horizontal gap between two boxes)
        # is also smaller than the x_tolerance threshold, then
        # these two tokens are considered as line-wise close.

    # BUG FIX: the tolerances were previously swapped when bound into the
    # scoring function (y_tolerance=x_tolerance, x_tolerance=y_tolerance),
    # which silently exchanged their meanings whenever the caller passed
    # different values for the two parameters.
    detected_lines = generalized_connected_component_analysis_1d(
        layout,
        scoring_func=partial(
            is_line_wise_close, x_tolerance=x_tolerance, y_tolerance=y_tolerance
        ),
        aggregation_func=lambda seq: reduce(layout[0].__class__.union, seq),
    )

    return detected_lines


def group_textblocks_based_on_category(
    layout: Iterable[TextBlock], union_group: bool = True
) -> Union[List[TextBlock], List[List[TextBlock]]]:
    """Group textblocks based on their category (block.type).

    Args:
        layout (Iterable):
            A list (or Layout) of BaseLayoutElement
        union_group (bool):
            Whether to union the boxes within each group.
            Defaults to True.

    Returns:
        List[TextBlock]: When `union_group=True`, it produces a list of
            TextBlocks, denoting the boundaries of each textblock group.
        List[List[TextBlock]]: When `union_group=False`, it preserves
            the elements within each group for further processing.
    """

    if union_group:
        aggregation_func = lambda seq: reduce(layout[0].__class__.union, seq)
    else:
        aggregation_func = None

    detected_group_boxes = generalized_connected_component_analysis_1d(
        layout,
        scoring_func=lambda a, b: a.type == b.type,
        aggregation_func=aggregation_func,
    )

    return detected_group_boxes
import json
import numpy as np
from layoutparser.elements import Interval, Rectangle, Quadrilateral, TextBlock, Layout

if __name__ == "__main__":

    # Plain geometric elements plus a Layout carrying page metadata.
    i = Interval(1, 2, "y", canvas_height=5)
    r = Rectangle(1, 2, 3, 4)
    q = Quadrilateral(np.arange(8).reshape(4, 2), 200, 400)
    l = Layout([i, r, q], page_data={"width": 200, "height": 200})

    for filename, element in [
        ("interval.json", i),
        ("rectangle.json", r),
        ("quadrilateral.json", q),
        ("layout.json", l),
    ]:
        with open(filename, "w") as fp:
            json.dump(element.to_dict(), fp)
    l.to_dataframe().to_csv("layout.csv", index=None)

    # The same shapes wrapped in TextBlocks with assorted metadata.
    i2 = TextBlock(i, "")
    r2 = TextBlock(r, id=24)
    q2 = TextBlock(q, text="test", parent=45)
    l2 = Layout([i2, r2, q2])

    for filename, element in [
        ("interval_textblock.json", i2),
        ("rectangle_textblock.json", r2),
        ("quadrilateral_textblock.json", q2),
        ("layout_textblock.json", l2),
    ]:
        with open(filename, "w") as fp:
            json.dump(element.to_dict(), fp)
    l2.to_dataframe().to_csv("layout_textblock.csv", index=None)
"y", "canvas_height": 5, "canvas_width": 0, "block_type": "interval"} -------------------------------------------------------------------------------- /tests/fixtures/io/interval_textblock.json: -------------------------------------------------------------------------------- 1 | {"start": 1, "end": 2, "axis": "y", "canvas_height": 5, "canvas_width": 0, "block_type": "interval", "text": ""} -------------------------------------------------------------------------------- /tests/fixtures/io/layout.csv: -------------------------------------------------------------------------------- 1 | start,end,axis,canvas_height,canvas_width,block_type,x_1,y_1,x_2,y_2,points,height,width 2 | 1.0,2.0,y,5.0,0.0,interval,,,,,,, 3 | ,,,,,rectangle,1.0,2.0,3.0,4.0,,, 4 | ,,,,,quadrilateral,,,,,"[0, 1, 2, 3, 4, 5, 6, 7]",200.0,400.0 5 | -------------------------------------------------------------------------------- /tests/fixtures/io/layout.json: -------------------------------------------------------------------------------- 1 | {"page_data": {"width": 200, "height": 200}, "blocks": [{"start": 1, "end": 2, "axis": "y", "canvas_height": 5, "canvas_width": 0, "block_type": "interval"}, {"x_1": 1, "y_1": 2, "x_2": 3, "y_2": 4, "block_type": "rectangle"}, {"points": [0, 1, 2, 3, 4, 5, 6, 7], "height": 200, "width": 400, "block_type": "quadrilateral"}]} -------------------------------------------------------------------------------- /tests/fixtures/io/layout_textblock.csv: -------------------------------------------------------------------------------- 1 | start,end,axis,canvas_height,canvas_width,block_type,text,x_1,y_1,x_2,y_2,id,points,height,width,parent 2 | 1.0,2.0,y,5.0,0.0,interval,,,,,,,,,, 3 | ,,,,,rectangle,,1.0,2.0,3.0,4.0,24.0,,,, 4 | ,,,,,quadrilateral,test,,,,,,"[0, 1, 2, 3, 4, 5, 6, 7]",200.0,400.0,45.0 5 | -------------------------------------------------------------------------------- /tests/fixtures/io/layout_textblock.json: 
-------------------------------------------------------------------------------- 1 | {"page_data": {}, "blocks": [{"start": 1, "end": 2, "axis": "y", "canvas_height": 5, "canvas_width": 0, "block_type": "interval", "text": ""}, {"x_1": 1, "y_1": 2, "x_2": 3, "y_2": 4, "block_type": "rectangle", "id": 24}, {"points": [0, 1, 2, 3, 4, 5, 6, 7], "height": 200, "width": 400, "block_type": "quadrilateral", "text": "test", "parent": 45}]} -------------------------------------------------------------------------------- /tests/fixtures/io/quadrilateral.json: -------------------------------------------------------------------------------- 1 | {"points": [0, 1, 2, 3, 4, 5, 6, 7], "height": 200, "width": 400, "block_type": "quadrilateral"} -------------------------------------------------------------------------------- /tests/fixtures/io/quadrilateral_textblock.json: -------------------------------------------------------------------------------- 1 | {"points": [0, 1, 2, 3, 4, 5, 6, 7], "height": 200, "width": 400, "block_type": "quadrilateral", "text": "test", "parent": 45} -------------------------------------------------------------------------------- /tests/fixtures/io/rectangle.json: -------------------------------------------------------------------------------- 1 | {"x_1": 1, "y_1": 2, "x_2": 3, "y_2": 4, "block_type": "rectangle"} -------------------------------------------------------------------------------- /tests/fixtures/io/rectangle_textblock.json: -------------------------------------------------------------------------------- 1 | {"x_1": 1, "y_1": 2, "x_2": 3, "y_2": 4, "block_type": "rectangle", "id": 24} -------------------------------------------------------------------------------- /tests/fixtures/model/config.yml: -------------------------------------------------------------------------------- 1 | CUDNN_BENCHMARK: false 2 | DATALOADER: 3 | ASPECT_RATIO_GROUPING: true 4 | FILTER_EMPTY_ANNOTATIONS: true 5 | NUM_WORKERS: 2 6 | REPEAT_THRESHOLD: 0.0 7 | 
SAMPLER_TRAIN: TrainingSampler 8 | DATASETS: 9 | PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000 10 | PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000 11 | PROPOSAL_FILES_TEST: [] 12 | PROPOSAL_FILES_TRAIN: [] 13 | TEST: 14 | - HJDataset_test 15 | TRAIN: 16 | - HJDataset_train 17 | GLOBAL: 18 | HACK: 1.0 19 | INPUT: 20 | CROP: 21 | ENABLED: false 22 | SIZE: 23 | - 0.9 24 | - 0.9 25 | TYPE: relative_range 26 | FORMAT: BGR 27 | MASK_FORMAT: polygon 28 | MAX_SIZE_TEST: 1333 29 | MAX_SIZE_TRAIN: 1333 30 | MIN_SIZE_TEST: 800 31 | MIN_SIZE_TRAIN: 32 | - 640 33 | - 672 34 | - 704 35 | - 736 36 | - 768 37 | - 800 38 | MIN_SIZE_TRAIN_SAMPLING: choice 39 | MODEL: 40 | ANCHOR_GENERATOR: 41 | ANGLES: 42 | - - -90 43 | - 0 44 | - 90 45 | ASPECT_RATIOS: 46 | - - 0.5 47 | - 1.0 48 | - 2.0 49 | NAME: DefaultAnchorGenerator 50 | OFFSET: 0.0 51 | SIZES: 52 | - - 32 53 | - - 64 54 | - - 128 55 | - - 256 56 | - - 512 57 | BACKBONE: 58 | FREEZE_AT: 2 59 | NAME: build_resnet_fpn_backbone 60 | DEVICE: cuda 61 | FPN: 62 | FUSE_TYPE: sum 63 | IN_FEATURES: 64 | - res2 65 | - res3 66 | - res4 67 | - res5 68 | NORM: '' 69 | OUT_CHANNELS: 256 70 | KEYPOINT_ON: false 71 | LOAD_PROPOSALS: false 72 | MASK_ON: false 73 | META_ARCHITECTURE: GeneralizedRCNN 74 | PANOPTIC_FPN: 75 | COMBINE: 76 | ENABLED: true 77 | INSTANCES_CONFIDENCE_THRESH: 0.5 78 | OVERLAP_THRESH: 0.5 79 | STUFF_AREA_LIMIT: 4096 80 | INSTANCE_LOSS_WEIGHT: 1.0 81 | PIXEL_MEAN: 82 | - 103.53 83 | - 116.28 84 | - 123.675 85 | PIXEL_STD: 86 | - 1.0 87 | - 1.0 88 | - 1.0 89 | PROPOSAL_GENERATOR: 90 | MIN_SIZE: 0 91 | NAME: RPN 92 | RESNETS: 93 | DEFORM_MODULATED: false 94 | DEFORM_NUM_GROUPS: 1 95 | DEFORM_ON_PER_STAGE: 96 | - false 97 | - false 98 | - false 99 | - false 100 | DEPTH: 50 101 | NORM: FrozenBN 102 | NUM_GROUPS: 1 103 | OUT_FEATURES: 104 | - res2 105 | - res3 106 | - res4 107 | - res5 108 | RES2_OUT_CHANNELS: 256 109 | RES5_DILATION: 1 110 | STEM_OUT_CHANNELS: 64 111 | STRIDE_IN_1X1: true 112 | WIDTH_PER_GROUP: 64 113 | RETINANET: 114 | 
BBOX_REG_WEIGHTS: 115 | - 1.0 116 | - 1.0 117 | - 1.0 118 | - 1.0 119 | FOCAL_LOSS_ALPHA: 0.25 120 | FOCAL_LOSS_GAMMA: 2.0 121 | IN_FEATURES: 122 | - p3 123 | - p4 124 | - p5 125 | - p6 126 | - p7 127 | IOU_LABELS: 128 | - 0 129 | - -1 130 | - 1 131 | IOU_THRESHOLDS: 132 | - 0.4 133 | - 0.5 134 | NMS_THRESH_TEST: 0.5 135 | NUM_CLASSES: 9 136 | NUM_CONVS: 4 137 | PRIOR_PROB: 0.01 138 | SCORE_THRESH_TEST: 0.05 139 | SMOOTH_L1_LOSS_BETA: 0.1 140 | TOPK_CANDIDATES_TEST: 1000 141 | ROI_BOX_CASCADE_HEAD: 142 | BBOX_REG_WEIGHTS: 143 | - - 10.0 144 | - 10.0 145 | - 5.0 146 | - 5.0 147 | - - 20.0 148 | - 20.0 149 | - 10.0 150 | - 10.0 151 | - - 30.0 152 | - 30.0 153 | - 15.0 154 | - 15.0 155 | IOUS: 156 | - 0.5 157 | - 0.6 158 | - 0.7 159 | ROI_BOX_HEAD: 160 | BBOX_REG_WEIGHTS: 161 | - 10.0 162 | - 10.0 163 | - 5.0 164 | - 5.0 165 | CLS_AGNOSTIC_BBOX_REG: false 166 | CONV_DIM: 256 167 | FC_DIM: 1024 168 | NAME: FastRCNNConvFCHead 169 | NORM: '' 170 | NUM_CONV: 0 171 | NUM_FC: 2 172 | POOLER_RESOLUTION: 7 173 | POOLER_SAMPLING_RATIO: 0 174 | POOLER_TYPE: ROIAlignV2 175 | SMOOTH_L1_BETA: 0.0 176 | TRAIN_ON_PRED_BOXES: false 177 | ROI_HEADS: 178 | BATCH_SIZE_PER_IMAGE: 256 179 | IN_FEATURES: 180 | - p2 181 | - p3 182 | - p4 183 | - p5 184 | IOU_LABELS: 185 | - 0 186 | - 1 187 | IOU_THRESHOLDS: 188 | - 0.5 189 | NAME: StandardROIHeads 190 | NMS_THRESH_TEST: 0.5 191 | NUM_CLASSES: 8 192 | POSITIVE_FRACTION: 0.25 193 | PROPOSAL_APPEND_GT: true 194 | SCORE_THRESH_TEST: 0.05 195 | ROI_KEYPOINT_HEAD: 196 | CONV_DIMS: 197 | - 512 198 | - 512 199 | - 512 200 | - 512 201 | - 512 202 | - 512 203 | - 512 204 | - 512 205 | LOSS_WEIGHT: 1.0 206 | MIN_KEYPOINTS_PER_IMAGE: 1 207 | NAME: KRCNNConvDeconvUpsampleHead 208 | NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: true 209 | NUM_KEYPOINTS: 17 210 | POOLER_RESOLUTION: 14 211 | POOLER_SAMPLING_RATIO: 0 212 | POOLER_TYPE: ROIAlignV2 213 | ROI_MASK_HEAD: 214 | CLS_AGNOSTIC_MASK: false 215 | CONV_DIM: 256 216 | NAME: MaskRCNNConvUpsampleHead 217 | NORM: 
'' 218 | NUM_CONV: 4 219 | POOLER_RESOLUTION: 14 220 | POOLER_SAMPLING_RATIO: 0 221 | POOLER_TYPE: ROIAlignV2 222 | RPN: 223 | BATCH_SIZE_PER_IMAGE: 256 224 | BBOX_REG_WEIGHTS: 225 | - 1.0 226 | - 1.0 227 | - 1.0 228 | - 1.0 229 | BOUNDARY_THRESH: -1 230 | HEAD_NAME: StandardRPNHead 231 | IN_FEATURES: 232 | - p2 233 | - p3 234 | - p4 235 | - p5 236 | - p6 237 | IOU_LABELS: 238 | - 0 239 | - -1 240 | - 1 241 | IOU_THRESHOLDS: 242 | - 0.3 243 | - 0.7 244 | LOSS_WEIGHT: 1.0 245 | NMS_THRESH: 0.7 246 | POSITIVE_FRACTION: 0.5 247 | POST_NMS_TOPK_TEST: 1000 248 | POST_NMS_TOPK_TRAIN: 1000 249 | PRE_NMS_TOPK_TEST: 1000 250 | PRE_NMS_TOPK_TRAIN: 2000 251 | SMOOTH_L1_BETA: 0.0 252 | SEM_SEG_HEAD: 253 | COMMON_STRIDE: 4 254 | CONVS_DIM: 128 255 | IGNORE_VALUE: 255 256 | IN_FEATURES: 257 | - p2 258 | - p3 259 | - p4 260 | - p5 261 | LOSS_WEIGHT: 1.0 262 | NAME: SemSegFPNHead 263 | NORM: GN 264 | NUM_CLASSES: 54 265 | WEIGHTS: https://www.dropbox.com/s/3hafewz6wcvev04/model_final.pth?dl=1 266 | OUTPUT_DIR: ./train_log/faster_rcnn_R_50_FPN_3x 267 | SEED: -1 268 | SOLVER: 269 | BASE_LR: 0.00025 270 | BIAS_LR_FACTOR: 1.0 271 | CHECKPOINT_PERIOD: 30000 272 | GAMMA: 0.1 273 | IMS_PER_BATCH: 2 274 | LR_SCHEDULER_NAME: WarmupMultiStepLR 275 | MAX_ITER: 60000 276 | MOMENTUM: 0.9 277 | STEPS: 278 | - 210000 279 | - 250000 280 | WARMUP_FACTOR: 0.001 281 | WARMUP_ITERS: 1000 282 | WARMUP_METHOD: linear 283 | WEIGHT_DECAY: 0.0001 284 | WEIGHT_DECAY_BIAS: 0.0001 285 | WEIGHT_DECAY_NORM: 0.0 286 | TEST: 287 | AUG: 288 | ENABLED: false 289 | FLIP: true 290 | MAX_SIZE: 4000 291 | MIN_SIZES: 292 | - 400 293 | - 500 294 | - 600 295 | - 700 296 | - 800 297 | - 900 298 | - 1000 299 | - 1100 300 | - 1200 301 | DETECTIONS_PER_IMAGE: 100 302 | EVAL_PERIOD: 0 303 | EXPECTED_RESULTS: [] 304 | KEYPOINT_OKS_SIGMAS: [] 305 | PRECISE_BN: 306 | ENABLED: false 307 | NUM_ITER: 200 308 | VERSION: 2 309 | VIS_PERIOD: 0 310 | -------------------------------------------------------------------------------- 
/tests/fixtures/model/layout_detection_reference.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Layout-Parser/layout-parser/04e28168d820eea3a1ff1e098078323e7b48648b/tests/fixtures/model/layout_detection_reference.jpg -------------------------------------------------------------------------------- /tests/fixtures/model/layout_detection_reference.json: -------------------------------------------------------------------------------- 1 | {"page_data": {}, "blocks": [{"x_1": 648.9922485351562, "y_1": 1418.7113037109375, "x_2": 1132.6805419921875, "y_2": 1479.303955078125, "block_type": "rectangle", "type": "Text", "score": 0.9995978474617004}, {"x_1": 106.12457275390625, "y_1": 1032.07470703125, "x_2": 599.2977905273438, "y_2": 1323.208984375, "block_type": "rectangle", "type": "Text", "score": 0.9981802701950073}, {"x_1": 639.54736328125, "y_1": 773.1265869140625, "x_2": 1135.9765625, "y_2": 1044.6507568359375, "block_type": "rectangle", "type": "Text", "score": 0.9974864721298218}, {"x_1": 104.36861419677734, "y_1": 767.3282470703125, "x_2": 595.1759643554688, "y_2": 970.451171875, "block_type": "rectangle", "type": "Text", "score": 0.9974320530891418}, {"x_1": 107.37610626220703, "y_1": 1448.544189453125, "x_2": 598.3998413085938, "y_2": 1488.01611328125, "block_type": "rectangle", "type": "Text", "score": 0.9953517913818359}, {"x_1": 132.01339721679688, "y_1": 146.253173828125, "x_2": 1160.3997802734375, "y_2": 652.8322143554688, "block_type": "rectangle", "type": "Figure", "score": 0.9953091740608215}, {"x_1": 103.79012298583984, "y_1": 1327.6717529296875, "x_2": 601.3895874023438, "y_2": 1429.9224853515625, "block_type": "rectangle", "type": "Text", "score": 0.9949470162391663}, {"x_1": 103.83270263671875, "y_1": 671.7702026367188, "x_2": 1138.1756591796875, "y_2": 748.6300659179688, "block_type": "rectangle", "type": "Text", "score": 0.9943684935569763}, {"x_1": 104.0943832397461, "y_1": 
985.9046020507812, "x_2": 444.34979248046875, "y_2": 1011.3511352539062, "block_type": "rectangle", "type": "Title", "score": 0.9880087375640869}, {"x_1": 395.9805908203125, "y_1": 141.7040252685547, "x_2": 1141.115478515625, "y_2": 659.3515625, "block_type": "rectangle", "type": "Figure", "score": 0.9815265536308289}, {"x_1": 107.32891845703125, "y_1": 149.01644897460938, "x_2": 405.1805419921875, "y_2": 582.9757690429688, "block_type": "rectangle", "type": "Figure", "score": 0.965209424495697}, {"x_1": 638.6964721679688, "y_1": 1075.6173095703125, "x_2": 1137.9869384765625, "y_2": 1154.6956787109375, "block_type": "rectangle", "type": "Text", "score": 0.9612341523170471}, {"x_1": 137.1743621826172, "y_1": 591.2607421875, "x_2": 376.2920227050781, "y_2": 609.2918701171875, "block_type": "rectangle", "type": "Text", "score": 0.9027073979377747}, {"x_1": 643.3095703125, "y_1": 1175.7694091796875, "x_2": 1127.9664306640625, "y_2": 1416.0784912109375, "block_type": "rectangle", "type": "Table", "score": 0.8846631646156311}]} -------------------------------------------------------------------------------- /tests/fixtures/model/test_model_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Layout-Parser/layout-parser/04e28168d820eea3a1ff1e098078323e7b48648b/tests/fixtures/model/test_model_image.jpg -------------------------------------------------------------------------------- /tests/fixtures/ocr/test_gcv_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Layout-Parser/layout-parser/04e28168d820eea3a1ff1e098078323e7b48648b/tests/fixtures/ocr/test_gcv_image.jpg -------------------------------------------------------------------------------- /tests/fixtures/ocr/test_tesseract_response.pickle: -------------------------------------------------------------------------------- 
def test_json():
    """Round-trip every element type through to_dict / load_dict / load_json."""
    i = Interval(1, 2, "y", canvas_height=5)
    r = Rectangle(1, 2, 3, 4)
    q = Quadrilateral(np.arange(8).reshape(4, 2), 200, 400)
    l = Layout([i, r, q], page_data={"width": 200, "height": 200})

    i2 = TextBlock(i, "")
    r2 = TextBlock(r, id=24)
    q2 = TextBlock(q, text="test", parent=45)
    l2 = Layout([i2, r2, q2])

    i3 = TextBlock(i, None)
    r3 = TextBlock(r, id=None)
    q3 = TextBlock(q, text=None, parent=None)
    l3 = Layout([i3, r3, q3], page_data={"width": 200, "height": 200})

    round_trip_cases = [
        (i, "tests/fixtures/io/interval.json"),
        (r, "tests/fixtures/io/rectangle.json"),
        (q, "tests/fixtures/io/quadrilateral.json"),
        (l, "tests/fixtures/io/layout.json"),
        (i2, "tests/fixtures/io/interval_textblock.json"),
        (r2, "tests/fixtures/io/rectangle_textblock.json"),
        (q2, "tests/fixtures/io/quadrilateral_textblock.json"),
        (l2, "tests/fixtures/io/layout_textblock.json"),
    ]
    for element, fixture_path in round_trip_cases:
        assert element == load_dict(element.to_dict()) == load_json(fixture_path)

    # Test if LP can ignore the unused None features
    assert l == load_dict(l3.to_dict())


def test_csv():
    """Round-trip layouts through the CSV serialization fixtures."""
    i = Interval(1, 2, "y", canvas_height=5)
    r = Rectangle(1, 2, 3, 4)
    q = Quadrilateral(np.arange(8).reshape(4, 2), 200, 400)
    l = Layout([i, r, q], page_data={"width": 200, "height": 200})

    loaded = load_csv("tests/fixtures/io/layout.csv")
    # page_data is not stored in CSV, so equality only holds after restoring it.
    assert loaded != l
    loaded.page_data = {"width": 200, "height": 200}
    assert loaded == l

    i2 = i  # <- Allow mixmode loading
    r2 = TextBlock(r, id=24)
    q2 = TextBlock(q, text="test", parent=45)
    l2 = Layout([i2, r2, q2])

    loaded_textblock = load_csv("tests/fixtures/io/layout_textblock.csv")
    assert loaded_textblock == l2


def test_pdf():
    """A single-page PDF yields one page layout with page metadata."""
    pdf_layout = load_pdf("tests/fixtures/io/example.pdf")
    assert len(pdf_layout) == 1

    page_layout = pdf_layout[0]
    for attr_name in ["width", "height", "index"]:
        assert attr_name in page_layout.page_data

    # Only three types of font show-up in the file
    assert len(set(ele.type for ele in page_layout)) == 3


def test_empty_pdf():
    """A PDF with no selectable text yields one page with zero tokens."""
    pdf_layout = load_pdf("tests/fixtures/io/empty.pdf")
    assert len(pdf_layout) == 1  # Only one page

    page_layout = pdf_layout[0]
    assert len(page_layout) == 0  # No selectable tokens on the page
# /tests/test_model.py
# ---------------------------------------------------------------------------
# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
import cv2

from layoutparser import load_json
from layoutparser.models import *

# All published config URLs per backend. The full lists are only exercised
# when a test is called with is_large_scale=True.
ALL_DETECTRON2_MODEL_CONFIGS = [
    "lp://PrimaLayout/mask_rcnn_R_50_FPN_3x/config",
    "lp://HJDataset/faster_rcnn_R_50_FPN_3x/config",
    "lp://HJDataset/mask_rcnn_R_50_FPN_3x/config",
    "lp://HJDataset/retinanet_R_50_FPN_3x/config",
    "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config",
    "lp://PubLayNet/mask_rcnn_R_50_FPN_3x/config",
    "lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config",
    "lp://NewspaperNavigator/faster_rcnn_R_50_FPN_3x/config",
    "lp://TableBank/faster_rcnn_R_50_FPN_3x/config",
    "lp://TableBank/faster_rcnn_R_101_FPN_3x/config",
    "lp://MFD/faster_rcnn_R_50_FPN_3x/config",
]

ALL_PADDLEDETECTION_MODEL_CONFIGS = [
    "lp://PubLayNet/ppyolov2_r50vd_dcn_365e/config",
    "lp://TableBank/ppyolov2_r50vd_dcn_365e/config",
]

ALL_EFFDET_MODEL_CONFIGS = [
    "lp://PubLayNet/tf_efficientdet_d0/config",
    "lp://PubLayNet/tf_efficientdet_d1/config",
    "lp://MFD/tf_efficientdet_d0/config",
    "lp://MFD/tf_efficientdet_d1/config",
]


def _construct_valid_config_variations(config, backend_name):
    """Return every valid spelling of *config*, from the fully qualified
    backend/dataset/arch/identifier form down to the dataset-only shorthand."""
    dataset_name, arch_name, identifier = config[len("lp://") :].split("/")
    return [
        "lp://" + "/".join([backend_name, dataset_name, arch_name, identifier]),
        "lp://" + "/".join([backend_name, dataset_name, arch_name]),
        "lp://" + "/".join([backend_name, dataset_name]),
        "lp://" + "/".join([dataset_name, arch_name, identifier]),
        "lp://" + "/".join([dataset_name, arch_name]),
        "lp://" + "/".join([dataset_name]),
    ]


def _construct_invalid_config_variations(config, backend_name):
    """Return spellings that must be rejected: a backend name alone does
    not identify a model."""
    dataset_name, arch_name, identifier = config[len("lp://") :].split("/")
    return [
        "lp://" + "/".join([backend_name]),
    ]


def _single_config_test_pipeline(TestLayoutModel, base_config):
    """Instantiate *TestLayoutModel* for every valid variation of
    *base_config*, run detection once per variation, and verify that the
    invalid variations raise ValueError."""
    for config in _construct_valid_config_variations(
        base_config, TestLayoutModel.DETECTOR_NAME
    ):
        model = TestLayoutModel(config)
        image = cv2.imread("tests/fixtures/model/test_model_image.jpg")
        layout = model.detect(image)
        del model  # free model memory before loading the next variation

    for config in _construct_invalid_config_variations(
        base_config, TestLayoutModel.DETECTOR_NAME
    ):
        with pytest.raises(ValueError):
            model = TestLayoutModel(config)


def test_Detectron2Model(is_large_scale=False):
    """Smoke-test the Detectron2 backend (config URL variations + a local
    CPU-only config file)."""
    if is_large_scale:
        for config in ALL_DETECTRON2_MODEL_CONFIGS:
            model = Detectron2LayoutModel(config)

            image = cv2.imread("tests/fixtures/model/test_model_image.jpg")
            layout = model.detect(image)
    else:
        _single_config_test_pipeline(
            Detectron2LayoutModel, ALL_DETECTRON2_MODEL_CONFIGS[0]
        )
        # Test in enforce CPU mode
        model = Detectron2LayoutModel("tests/fixtures/model/config.yml")
        image = cv2.imread("tests/fixtures/model/test_model_image.jpg")
        layout = model.detect(image)


def test_Detectron2Model_version_compatibility(enabled=False):
    """Compare current detection output against a stored reference layout
    (disabled by default; reference was produced by an earlier version)."""
    if enabled:
        model = Detectron2LayoutModel(
            config_path="lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config",
            extra_config=[
                "MODEL.ROI_HEADS.SCORE_THRESH_TEST",
                0.85,
                "MODEL.ROI_HEADS.NMS_THRESH_TEST",
                0.75,
            ],
        )
        image = cv2.imread("tests/fixtures/model/layout_detection_reference.jpg")
        layout = model.detect(image)
        assert (
            load_json("tests/fixtures/model/layout_detection_reference.json") == layout
        )


def test_PaddleDetectionModel(is_large_scale=False):
    """test PaddleDetection model"""
    if is_large_scale:
        for config in ALL_PADDLEDETECTION_MODEL_CONFIGS:
            model = PaddleDetectionLayoutModel(config)

            image = cv2.imread("tests/fixtures/model/test_model_image.jpg")
            layout = model.detect(image)
    else:
        _single_config_test_pipeline(
            PaddleDetectionLayoutModel, ALL_PADDLEDETECTION_MODEL_CONFIGS[0]
        )


def test_EffDetModel(is_large_scale=False):
    """Smoke-test the EfficientDet backend."""
    if is_large_scale:
        for config in ALL_EFFDET_MODEL_CONFIGS:
            model = EfficientDetLayoutModel(config)

            image = cv2.imread("tests/fixtures/model/test_model_image.jpg")
            layout = model.detect(image)
    else:
        _single_config_test_pipeline(
            EfficientDetLayoutModel, ALL_EFFDET_MODEL_CONFIGS[0]
        )


def test_AutoModel():
    """AutoLayoutModel resolves full config URLs and dataset-only names,
    and rejects names it cannot resolve."""
    # Full configs
    auto_model_config_1 = [
        "lp://detectron2/PubLayNet/faster_rcnn_R_50_FPN_3x/config",
        "lp://paddledetection/PubLayNet/ppyolov2_r50vd_dcn_365e/config",
        "lp://efficientdet/PubLayNet/tf_efficientdet_d0/config",
    ]
    for config in auto_model_config_1:
        model = AutoLayoutModel(config)
        image = cv2.imread("tests/fixtures/model/test_model_image.jpg")
        layout = model.detect(image)

    # Dataset name only
    # It will use the first available model
    auto_model_config_2 = [
        "lp://PubLayNet",
        "lp://MFD",
    ]
    # BUG FIX: the original loop re-iterated auto_model_config_1, so the
    # dataset-only shorthand was never exercised.
    for config in auto_model_config_2:
        model = AutoLayoutModel(config)
        # BUG FIX: the original line was a bare no-op comparison
        # (`model.DETECTOR_NAME == "efficientdet"`). Assert membership
        # instead, since which backend is "first available" depends on
        # what is installed.
        assert model.DETECTOR_NAME in ("detectron2", "paddledetection", "efficientdet")

    # Automodel name that doesn't work

    # 1. No available backend for the model
    with pytest.raises(ValueError):
        model = AutoLayoutModel("lp://prima")

    # 2. Completely invalid name
    with pytest.raises(ValueError):
        model = AutoLayoutModel("lp://test")
# ---------------------------------------------------------------------------
# /tests/test_ocr.py
# ---------------------------------------------------------------------------
# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from layoutparser import (
    GCVAgent,
    GCVFeatureType,
    TesseractAgent,
    TesseractFeatureType,
)
import json, cv2, os

# Shared fixture image for the optional live-detection branches below.
image = cv2.imread("tests/fixtures/ocr/test_gcv_image.jpg")


def test_gcv_agent(test_detect=False):
    """Parse a stored GCV response at every aggregation level; optionally
    compare against a fresh online detection."""
    # Load the agent with its default credential setup
    ocr_agent = GCVAgent()

    # Load the saved response and parse it at each granularity
    res = ocr_agent.load_response("tests/fixtures/ocr/test_gcv_response.json")
    text_annotations = ocr_agent.gather_text_annotations(res)
    levels = [
        GCVFeatureType.SYMBOL,
        GCVFeatureType.WORD,
        GCVFeatureType.PARA,
        GCVFeatureType.BLOCK,
        GCVFeatureType.PAGE,
    ]
    parsed = [ocr_agent.gather_full_text_annotation(res, lvl) for lvl in levels]

    # Compare against an online detection. Warning: the GCV service may
    # change over time, so it is best not to run this part frequently.
    if test_detect:
        res2 = ocr_agent.detect(image, return_response=True)

        assert res == res2
        assert text_annotations == ocr_agent.gather_text_annotations(res2)
        for expected, lvl in zip(parsed, levels):
            assert expected == ocr_agent.gather_full_text_annotation(res2, lvl)

    # Finally, test the response storage and remove the file
    ocr_agent.save_response(res, "tests/fixtures/ocr/.test_gcv_response.json")
    os.remove("tests/fixtures/ocr/.test_gcv_response.json")


def test_tesseract(test_detect=False):
    """Parse a stored Tesseract response at every aggregation level;
    optionally compare against a fresh online detection."""
    ocr_agent = TesseractAgent(languages="eng")
    res = ocr_agent.load_response("tests/fixtures/ocr/test_tesseract_response.pickle")
    full_text = res["text"]
    levels = [
        TesseractFeatureType.PAGE,
        TesseractFeatureType.BLOCK,
        TesseractFeatureType.PARA,
        TesseractFeatureType.LINE,
        TesseractFeatureType.WORD,
    ]
    parsed = [ocr_agent.gather_data(res, agg_level=lvl) for lvl in levels]

    # Results can differ if another version of the Tesseract engine is
    # used; tesseract 4.1.1 generated the pickle test file.
    if test_detect:
        res = ocr_agent.detect(image, return_response=True)
        assert full_text == res["text"]
        for expected, lvl in zip(parsed, levels):
            assert expected == ocr_agent.gather_data(res, agg_level=lvl)
# ---------------------------------------------------------------------------
# /tests/test_tools.py
# ---------------------------------------------------------------------------
# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from layoutparser import load_pdf
from layoutparser.tools import (
    generalized_connected_component_analysis_1d,
    simple_line_detection,
    group_textblocks_based_on_category,
)


def test_generalized_connected_component_analysis_1d():
    """Elements merge into one component whenever the pairwise scoring
    function fires; the aggregation function reduces each component."""

    def within(gap):
        # Two values belong together when they are at most `gap` apart.
        return lambda x, y: abs(x - y) <= gap

    assert (
        len(
            generalized_connected_component_analysis_1d(
                [1, 2, 3], scoring_func=within(1)
            )
        )
        == 1
    )

    sequence = [1, 2, 3, 5, 6, 7]
    assert (
        len(
            generalized_connected_component_analysis_1d(
                sequence, scoring_func=within(1)
            )
        )
        == 2
    )
    assert (
        len(
            generalized_connected_component_analysis_1d(
                sequence, scoring_func=within(2)
            )
        )
        == 1
    )
    assert generalized_connected_component_analysis_1d(
        sequence, scoring_func=within(1), aggregation_func=max
    ) == [3, 7]


def test_simple_line_detection():
    """The example PDF's first page contains exactly 15 detected lines."""
    page_layout = load_pdf("tests/fixtures/io/example.pdf")[0]
    assert len(simple_line_detection(page_layout)) == 15


def test_group_textblocks_based_on_category():
    """Grouping the example page's tokens by category yields 3 blocks."""
    page_layout = load_pdf("tests/fixtures/io/example.pdf")[0]
    assert len(group_textblocks_based_on_category(page_layout)) == 3
# ---------------------------------------------------------------------------
# /tests/test_visualization.py
# ---------------------------------------------------------------------------
# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest

from layoutparser.elements import *
from layoutparser.ocr import *
from layoutparser.visualization import *
import cv2
import numpy as np


def test_viz():
    """Exercise draw_box / draw_text over empty layouts, all element types,
    per-element colors/alphas/widths (including invalid inputs), and full
    GCV layouts at every aggregation level."""
    image = cv2.imread("tests/fixtures/ocr/test_gcv_image.jpg")
    # NOTE(review): the credential fixture below is not visible in the repo
    # tree — confirm it exists (or is gitignored) before running this test.
    ocr_agent = GCVAgent.with_credential(
        "tests/fixtures/ocr/test_gcv_credential.json", languages=["en"]
    )
    res = ocr_agent.load_response("tests/fixtures/ocr/test_gcv_response.json")

    # Empty layouts must not crash the drawing functions
    draw_box(image, Layout([]))
    draw_text(image, Layout([]))

    layout = Layout(
        [
            Interval(0, 10, axis="x"),
            Rectangle(0, 50, 100, 80),
            Quadrilateral(np.array([[10, 10], [30, 40], [90, 40], [10, 20]])),
        ]
    )

    draw_box(image, layout)
    draw_text(image, layout)

    # Test colors
    draw_box(image, layout, box_color=["red", "green", "blue"])
    draw_box(image, layout, box_color="red")

    draw_text(image, layout, box_color=["red", "green", "blue"])
    with pytest.raises(ValueError):
        draw_box(image, layout, box_color=["red", "green", "blue", "yellow"])
    with pytest.raises(ValueError):
        draw_text(
            image,
            layout,
            box_color=["red", "green", "blue", "yellow"],
            with_layout=True,
        )

    # Test alphas
    draw_box(image, layout, box_alpha=0)
    draw_box(image, layout, box_alpha=[0.1, 0.2, 0.3])
    # BUG FIX: the two failure cases below originally passed `box_color`;
    # this section validates `box_alpha` (wrong length, then out-of-range).
    with pytest.raises(ValueError):
        draw_box(image, layout, box_alpha=[0.1, 0.2, 0.3, 0.5])
    with pytest.raises(ValueError):
        draw_box(image, layout, box_alpha=[0.1, 0.2, 0.3, 1.5])

    # Test widths
    draw_box(image, layout, box_width=1)
    draw_box(image, layout, box_width=[1, 2, 3])
    with pytest.raises(ValueError):
        draw_box(image, layout, box_width=[1, 2, 3, 4])

    draw_box(
        image,
        layout,
        box_alpha=[0.1, 0.2, 0.3],
        box_width=[1, 2, 3],
        box_color=["red", "green", "blue"],
    )

    for idx, level in enumerate(
        [
            GCVFeatureType.SYMBOL,
            GCVFeatureType.WORD,
            GCVFeatureType.PARA,
            GCVFeatureType.BLOCK,
            GCVFeatureType.PAGE,
        ]
    ):

        layout = ocr_agent.gather_full_text_annotation(res, level)

        draw_text(
            image,
            layout,
            # BUG FIX: the original conditional was dead code
            # (`"ud" if idx % 2 else "ud"` — both branches identical);
            # alternate arrangements so both code paths are exercised.
            arrangement="lr" if idx % 2 else "ud",
            font_size=15,
            text_color="pink",
            text_background_color="grey",
            text_background_alpha=0.1,
            with_box_on_text=True,
            text_box_width=2,
            text_box_color="yellow",
            text_box_alpha=0.2,
            with_layout=True,
            box_width=1,
            color_map={None: "blue"},
            show_element_id=True,
            id_font_size=8,
            box_alpha=0.25,
            id_text_background_alpha=0.25,
        )

        draw_box(image, layout)
        draw_text(image, layout)
# ---------------------------------------------------------------------------
# /tests_deps/test_file_utils.py
# ---------------------------------------------------------------------------
# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest

from layoutparser import requires_backends


def test_when_backends_are_not_loaded():
    """With none of the optional backends installed, requires_backends must
    raise ImportError for every backend name."""
    for backend_name in [
        "torch",
        "detectron2",
        "paddle",
        "effdet",
        "pytesseract",
        "google-cloud-vision",
    ]:
        with pytest.raises(ImportError):
            requires_backends("a", backend_name)
# ---------------------------------------------------------------------------
# /tests_deps/test_only_detectron2.py
# ---------------------------------------------------------------------------
# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import cv2
import pytest
from layoutparser import Detectron2LayoutModel


# BUG FIX: this test was copy-pasted from the effdet variant and kept the
# name `test_only_effdet_model`; renamed to match the backend under test.
def test_only_detectron2_model():
    """With only detectron2 installed, the Detectron2 model must work and
    importing the other backends' models must raise ImportError."""
    config = "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config"
    model = Detectron2LayoutModel(config)
    image = cv2.imread("tests/fixtures/model/test_model_image.jpg")
    layout = model.detect(image)

    # BUG FIX: the two imports were in a single `pytest.raises` block, so
    # the second import was unreachable; each failure is now asserted.
    with pytest.raises(ImportError):
        from layoutparser import EfficientDetLayoutModel
    with pytest.raises(ImportError):
        from layoutparser import PaddleDetectionLayoutModel
# ---------------------------------------------------------------------------
# /tests_deps/test_only_effdet.py
# ---------------------------------------------------------------------------
# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import cv2
import pytest
from layoutparser import EfficientDetLayoutModel


def test_only_effdet_model():
    """With only effdet installed, the EfficientDet model must work and
    importing the other backends' models must raise ImportError."""
    config = "lp://PubLayNet/tf_efficientdet_d0/config"
    model = EfficientDetLayoutModel(config)
    image = cv2.imread("tests/fixtures/model/test_model_image.jpg")
    layout = model.detect(image)

    # BUG FIX: the two imports were in a single `pytest.raises` block, so
    # the second import was unreachable; each failure is now asserted.
    with pytest.raises(ImportError):
        from layoutparser import Detectron2LayoutModel
    with pytest.raises(ImportError):
        from layoutparser import PaddleDetectionLayoutModel
# ---------------------------------------------------------------------------
# /tests_deps/test_only_paddledetection.py
# ---------------------------------------------------------------------------
# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import cv2
import pytest
from layoutparser import PaddleDetectionLayoutModel


# BUG FIX: this test was copy-pasted from the effdet variant and kept the
# name `test_only_effdet_model`; renamed to match the backend under test.
def test_only_paddledetection_model():
    """With only paddledetection installed, the PaddleDetection model must
    work and importing the other backends' models must raise ImportError."""
    config = "lp://PubLayNet/ppyolov2_r50vd_dcn_365e/config"
    model = PaddleDetectionLayoutModel(config)
    image = cv2.imread("tests/fixtures/model/test_model_image.jpg")
    layout = model.detect(image)

    # BUG FIX: the two imports were in a single `pytest.raises` block, so
    # the second import was unreachable; each failure is now asserted.
    with pytest.raises(ImportError):
        from layoutparser import EfficientDetLayoutModel
    with pytest.raises(ImportError):
        from layoutparser import Detectron2LayoutModel
# ---------------------------------------------------------------------------