├── .gitignore ├── CHANGELOG.md ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── RELEASE.md ├── SECURITY.md ├── VERSION.txt ├── devops ├── nightly.yml ├── pr-gate.yml ├── release.yml └── templates │ ├── base │ ├── publish-test-results.yml │ ├── run-linter.yml │ └── run-tests.yml │ ├── build_wheel_n_sdist.yml │ ├── install-dependencies.yml │ ├── merge-cov-reports.yml │ └── run-tests-on-multiple-os-py.yml ├── docs ├── .gitignore ├── genalog_docs │ ├── _config.yml │ ├── _toc.yml │ ├── doc_degradation.md │ ├── doc_generation.md │ ├── docstring │ │ ├── genalog.degradation.rst │ │ ├── genalog.generation.rst │ │ ├── genalog.ocr.rst │ │ └── genalog.text.rst │ ├── e2e_dataset_pipeline.md │ ├── generation_pipeline.ipynb │ ├── index.md │ ├── installation.md │ ├── ocr_label_propagation.ipynb │ ├── static │ │ ├── analog_doc_gen_pipeline.png │ │ ├── bleed_through.png │ │ ├── blur.png │ │ ├── close_dilate.png │ │ ├── columns_Times_11px.png │ │ ├── degrader.png │ │ ├── degrader_heavy.png │ │ ├── genalog_demo.gif │ │ ├── genalog_favicon.svg │ │ ├── genalog_full_logo.svg │ │ ├── genalog_logo_no_text.svg │ │ ├── kernel_morph.png │ │ ├── labeled_synthetic_pipeline.png │ │ ├── letter_Times_11px.png │ │ ├── open_erode.png │ │ ├── salt_pepper.png │ │ └── text_block_Times_11px.png │ └── text_alignment.ipynb └── requirements-doc.txt ├── example ├── dataset_generation.ipynb ├── demo_generate.py ├── document_degradation.ipynb ├── document_generation.ipynb ├── generation_pipeline.ipynb ├── ocr_extraction.ipynb ├── ocr_label_propagation.ipynb ├── sample │ ├── degradation │ │ ├── bleed_through.png │ │ ├── blur.png │ │ ├── close_dilate.png │ │ ├── degrader.png │ │ ├── degrader_heavy.png │ │ ├── kernel_morph.png │ │ ├── open_erode.png │ │ ├── salt_pepper.png │ │ ├── text_block.png │ │ └── text_zoomed.png │ └── generation │ │ ├── columns_Times_11px.pdf │ │ ├── columns_Times_11px.png │ │ ├── example.txt │ │ ├── letter_Times_11px.pdf │ │ ├── letter_Times_11px.png │ 
│ ├── text_block_Times_11px.pdf │ │ ├── text_block_Times_11px.png │ │ ├── text_block_Times_11px_pg_0.png │ │ └── text_block_Times_11px_pg_1.png ├── static │ ├── analog_doc_gen_pipeline.png │ ├── genalog_components.png │ └── labeled_synthetic_pipeline.png └── text_alignment.ipynb ├── genalog ├── README.md ├── __init__.py ├── degradation │ ├── README.md │ ├── __init__.py │ ├── degrader.py │ └── effect.py ├── generation │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── content.py │ ├── document.py │ └── templates │ │ ├── base.css.jinja │ │ ├── base.html.jinja │ │ ├── columns.css.jinja │ │ ├── columns.html.jinja │ │ ├── letter.css.jinja │ │ ├── letter.html.jinja │ │ ├── macro │ │ ├── dimension.css.jinja │ │ ├── page_layout.css.jinja │ │ └── text.css.jinja │ │ ├── text_block.css.jinja │ │ └── text_block.html.jinja ├── ocr │ ├── README.md │ ├── __init__.py │ ├── blob_client.py │ ├── common.py │ ├── grok.py │ ├── metrics.py │ ├── rest_client.py │ └── templates │ │ ├── datasource.json │ │ ├── index.json │ │ ├── indexer.json │ │ ├── knowledge_store.json │ │ └── skillset.json ├── pipeline.py └── text │ ├── README.md │ ├── __init__.py │ ├── alignment.py │ ├── anchor.py │ ├── conll_format.py │ ├── lcs.py │ ├── ner_label.py │ ├── preprocess.py │ └── splitter.py ├── requirements-dev.txt ├── requirements.txt ├── setup.py ├── tests ├── .env ├── __init__.py ├── conftest.py ├── e2e │ ├── data │ │ ├── conll_formatter │ │ │ ├── clean_labels │ │ │ │ ├── 0.txt │ │ │ │ ├── 1.txt │ │ │ │ ├── 11618.txt │ │ │ │ ├── 11656.txt │ │ │ │ ├── 16.txt │ │ │ │ ├── 17.txt │ │ │ │ ├── 1838.txt │ │ │ │ ├── 1839.txt │ │ │ │ ├── 1901.txt │ │ │ │ ├── 2.txt │ │ │ │ ├── 2161.txt │ │ │ │ ├── 3.txt │ │ │ │ ├── 4.txt │ │ │ │ ├── 482.txt │ │ │ │ ├── 5.txt │ │ │ │ ├── 6.txt │ │ │ │ ├── 7.txt │ │ │ │ ├── 7965.txt │ │ │ │ ├── 8.txt │ │ │ │ └── 9.txt │ │ │ └── ocr_text │ │ │ │ ├── 0.txt │ │ │ │ ├── 1.txt │ │ │ │ ├── 11618.txt │ │ │ │ ├── 11656.txt │ │ │ │ ├── 16.txt │ │ │ │ ├── 17.txt │ │ │ │ ├── 1838.txt │ 
│ │ │ ├── 1839.txt │ │ │ │ ├── 1901.txt │ │ │ │ ├── 2.txt │ │ │ │ ├── 2161.txt │ │ │ │ ├── 3.txt │ │ │ │ ├── 4.txt │ │ │ │ ├── 5.txt │ │ │ │ ├── 6.txt │ │ │ │ ├── 7.txt │ │ │ │ ├── 7965.txt │ │ │ │ ├── 8.txt │ │ │ │ └── 9.txt │ │ ├── splitter │ │ │ ├── example_conll2012.txt │ │ │ └── example_splits │ │ │ │ ├── clean_labels │ │ │ │ ├── 0.txt │ │ │ │ └── 1.txt │ │ │ │ └── clean_text │ │ │ │ ├── 0.txt │ │ │ │ └── 1.txt │ │ └── synthetic_dataset │ │ │ ├── shared │ │ │ ├── test │ │ │ │ └── clean_labels │ │ │ │ │ └── 1901.txt │ │ │ └── train │ │ │ │ └── clean_labels │ │ │ │ └── 2161.txt │ │ │ └── test_version │ │ │ ├── .gitignore │ │ │ ├── test │ │ │ └── ocr │ │ │ │ └── 1901.json │ │ │ └── train │ │ │ └── ocr │ │ │ └── 2161.json │ ├── templates │ │ └── solid_bg.html.jinja │ ├── test_anchor_e2e.py │ ├── test_conll_format_e2e.py │ ├── test_document_generation.py │ ├── test_generaton_n_degradation.py │ ├── test_image_channel.py │ ├── test_ocr_e2e.py │ ├── test_pipeline.py │ └── test_splitter.py ├── required_env.py └── unit │ ├── __init__.py │ ├── cases │ ├── __init__.py │ ├── label_propagation.py │ └── text_alignment.py │ ├── degradation │ ├── __init__.py │ ├── test_degrader.py │ └── test_effect.py │ ├── generation │ ├── 2x2.jpg │ ├── __init__.py │ ├── templates │ │ ├── font_family.html.jinja │ │ ├── mock.html.jinja │ │ └── multipage.html.jinja │ ├── test_content.py │ └── test_document.py │ ├── ocr │ ├── __init__.py │ ├── data │ │ ├── img │ │ │ ├── 0.png │ │ │ ├── 1.png │ │ │ └── 11.png │ │ ├── json │ │ │ ├── 521c38122f783673598856cd81d91c21_0.json │ │ │ ├── 521c38122f783673598856cd81d91c21_1.json │ │ │ └── 521c38122f783673598856cd81d91c21_11.json │ │ ├── metrics.csv │ │ ├── metrics │ │ │ ├── json │ │ │ │ ├── 123_001.json │ │ │ │ ├── 123_002.json │ │ │ │ └── 123_003.json │ │ │ ├── metrics.csv │ │ │ ├── substitution.pkl │ │ │ └── text │ │ │ │ ├── 001.txt │ │ │ │ ├── 002.txt │ │ │ │ └── 003.txt │ │ ├── substitution.json │ │ ├── substitution.pkl │ │ └── text │ │ │ ├── 0.txt │ 
│ │ ├── 1.txt │ │ │ └── 11.txt │ ├── test_metrics.py │ └── test_ocr.py │ └── text │ ├── data │ ├── gt_1.txt │ ├── gt_2.txt │ ├── gt_3.txt │ ├── label_generator │ │ ├── labels │ │ │ ├── 0.tsv │ │ │ ├── 1.tsv │ │ │ └── 11.tsv │ │ └── text │ │ │ ├── 0.txt │ │ │ ├── 1.txt │ │ │ └── 11.txt │ ├── ocr_1.txt │ ├── ocr_2.txt │ └── ocr_3.txt │ ├── test_alignment.py │ ├── test_anchor.py │ ├── test_conll_format.py │ ├── test_lcs.py │ ├── test_ner_label.py │ ├── test_preprocess.py │ └── test_utf8.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | # Test output 2 | test_out 3 | 4 | # Secrets, keys and other credentials 5 | .secret* 6 | .cred* 7 | 8 | # Environments 9 | .env* 10 | .venv* 11 | **/.env/ 12 | env/ 13 | venv/ 14 | ENV/ 15 | env.bak/ 16 | venv.bak/ 17 | 18 | # Credentials 19 | .secrets 20 | .secret* 21 | 22 | # IDE 23 | .vscode 24 | 25 | # Byte-compiled / optimized / DLL files 26 | __pycache__/ 27 | *.py[cod] 28 | *$py.class 29 | 30 | # C extensions 31 | *.so 32 | 33 | # Distribution / packaging 34 | .Python 35 | build/ 36 | develop-eggs/ 37 | dist/ 38 | downloads/ 39 | eggs/ 40 | .eggs/ 41 | lib/ 42 | lib64/ 43 | parts/ 44 | sdist/ 45 | var/ 46 | wheels/ 47 | pip-wheel-metadata/ 48 | share/python-wheels/ 49 | *.egg-info/ 50 | .installed.cfg 51 | *.egg 52 | MANIFEST 53 | 54 | # PyInstaller 55 | # Usually these files are written by a python script from a template 56 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
57 | *.manifest 58 | *.spec 59 | 60 | # Installer logs 61 | pip-log.txt 62 | pip-delete-this-directory.txt 63 | 64 | # Unit test / coverage reports 65 | htmlcov/ 66 | .tox/ 67 | .nox/ 68 | .coverage 69 | .coverage.* 70 | .cache 71 | nosetests.xml 72 | coverage.xml 73 | *.cover 74 | *.py,cover 75 | .hypothesis/ 76 | .pytest_cache/ 77 | junit 78 | 79 | # Translations 80 | *.mo 81 | *.pot 82 | 83 | # Django stuff: 84 | *.log 85 | local_settings.py 86 | db.sqlite3 87 | db.sqlite3-journal 88 | 89 | # Flask stuff: 90 | instance/ 91 | .webassets-cache 92 | 93 | # Scrapy stuff: 94 | .scrapy 95 | 96 | # Sphinx documentation 97 | docs/_build/ 98 | 99 | # PyBuilder 100 | target/ 101 | 102 | # Jupyter Notebook 103 | .ipynb_checkpoints 104 | 105 | # IPython 106 | profile_default/ 107 | ipython_config.py 108 | 109 | # pyenv 110 | .python-version 111 | 112 | # pipenv 113 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 114 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 115 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 116 | # install all needed dependencies. 117 | #Pipfile.lock 118 | 119 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 120 | __pypackages__/ 121 | 122 | # Celery stuff 123 | celerybeat-schedule 124 | celerybeat.pid 125 | 126 | # SageMath parsed files 127 | *.sage.py 128 | 129 | # Environments 130 | .env 131 | .venv 132 | env/ 133 | venv/ 134 | ENV/ 135 | env.bak/ 136 | venv.bak/ 137 | 138 | # Spyder project settings 139 | .spyderproject 140 | .spyproject 141 | 142 | # Rope project settings 143 | .ropeproject 144 | 145 | # mkdocs documentation 146 | /site 147 | 148 | # mypy 149 | .mypy_cache/ 150 | .dmypy.json 151 | dmypy.json 152 | 153 | # Pyre type checker 154 | .pyre/ 155 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Genalog Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | Types of changes 5 | 1. `Added` for new features. 6 | 1. `Changed` for changes in existing functionality. 7 | 1. `Deprecated` for soon-to-be removed features. 8 | 1. `Removed` for now removed features. 9 | 1. `Fixed` for any bug fixes. 10 | 1. `Security` in case of vulnerabilities. 11 | 12 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 13 | and we adopt the [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 14 | 15 | ## [v0.1.0] - 2021-07-19 16 | ### Added 17 | - Initial package release: 18 | - 3 standard HTML document template for generation 19 | - basic image degradation effects including blur, bleed-through, salt & pepper and other morphological operations. 20 | - 2 flavors of text alignment algorithm: Needleman-Wunsch (shorter text segments) and RETAS (longer text segments) 21 | - Full e2e NER-OCR label generation notebooks 22 | - See [documentation](https://microsoft.github.io/genalog/installation.html) for more on the initial features of the package. 
 23 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | 2 | # Each line is a file pattern followed by one or more owners. 3 | 4 | # These owners will be the default owners for everything in 5 | # the repo. Unless a later match takes precedence, 6 | # @global-owner1 and @global-owner2 will be requested for review when someone opens a pull request. 7 | * @microsoft/genalog-admins 8 | 9 | genalog/degradation/ @laserprec 10 | genalog/generation/ @laserprec 11 | genalog/text/ @laserprec 12 | genalog/ocr/ @laserprec 13 | 14 | tests/ @laserprec -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Microsoft Corporation. 
2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.md 2 | include *.txt 3 | include LICENSE CODEOWNERS 4 | include .gitignore tox.ini MANIFEST.in 5 | recursive-include genalog *.py *.jinja 6 | recursive-include tests *.py *.jinja *.jpg -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | # Genalog Release Procedure 2 | 3 | Checklist for the release process of `genalog`: 4 | 5 | ### Preparation 6 | - [ ] Ensure `main` branch contains all relevant changes and PRs relating to the specific release is merged 7 | - [ ] Create and switch to a new release branch (i.e. 
release-X.Y.Z) 8 | 9 | ### Package Metadata Update 10 | - [ ] Update VERSION.txt with version bump. Please reference [Semantic Versioning](https://semver.org/). 11 | - [ ] Update [CHANGELOG.md](./CHANGELOG.md) 12 | - [ ] Commit the above changes with title "Release vX.Y.Z" 13 | - [ ] Generate a new git tag for the new version (e.g. `git tag -a v0.1.0 -m "Initial Release"`) 14 | - [ ] Push the new tag to remote `git push origin v0.1.0` 15 | - [ ] Create a new PR with the above changes into `main` branch. 16 | 17 | ### Run the Full Test Suites 18 | - [ ] If you haven't, `pip install tox` 19 | - [ ] Run the test suites with `tox -e py -- -m "not azure"` (we will skip the azure related tests as they will be deprecated) 20 | 21 | ### Release to PyPI 22 | - [ ] Manually trigger the [release pipeline](https://dev.azure.com/genalog-dev/genalog/_build?definitionId=2) in DevOps on the release branch, this will publish latest version of `genalog` to PyPI. 23 | - [ ] Select `releaseType` to `Test` to test out the release in [TestPyPI](https://test.pypi.org/project/genalog/) 24 | - [ ] Rerun and switch `releaseType` to production if looks good. 
25 | - [ ] If the pipeline ran successfully, check and publish the draft of this release on [Github Release](https://github.com/microsoft/genalog/releases) 26 | - [ ] Latest version is pip-installable with: 27 | - `pip install genalog` 28 | 29 | ### Update Documentation on Github Page 30 | - [ ] Staying on the release branch, `cd docs && pip install -r requirements-doc.txt` 31 | - [ ] Build the jupyter-book with `jupyter-book build --all genalog_docs` 32 | - [ ] Preview the HTML files, if looks good [publish to Github Page](https://jupyterbook.org/start/publish.html#publish-your-book-online-with-github-pages): `ghp-import -n -p -f genalog_docs/_build/html` 33 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 
14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
40 | 41 | -------------------------------------------------------------------------------- /VERSION.txt: -------------------------------------------------------------------------------- 1 | 0.1.0 -------------------------------------------------------------------------------- /devops/nightly.yml: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | name: $(Date:yyyyMMdd).$(Rev:r) 7 | 8 | trigger: none # nightly build is scheduled once per day 9 | 10 | pr: none 11 | 12 | variables: 13 | - group: azureResourceKeys 14 | 15 | stages: 16 | - stage: static_analysis 17 | jobs: 18 | - job: flake8_linux_py36 19 | pool: 20 | vmImage: 'ubuntu-latest' 21 | steps: 22 | - template: templates/base/run-linter.yml 23 | parameters: 24 | pyVersion: '3.6' 25 | - task: ComponentGovernanceComponentDetection@0 26 | 27 | - stage: unit_tests 28 | dependsOn: static_analysis 29 | jobs: 30 | - template: templates/run-tests-on-multiple-os-py.yml 31 | parameters: 32 | pyVersions: ['3.6', '3.7', '3.8'] 33 | testTypes: ['unit', 'io'] 34 | imageOSs: ['ubuntu-18.04'] # 'windows-latest', 'macos-latest' are not supported 35 | 36 | - stage: e2e_tests 37 | dependsOn: static_analysis 38 | jobs: 39 | - template: templates/run-tests-on-multiple-os-py.yml 40 | parameters: 41 | pyVersions: ['3.6', '3.7', '3.8'] 42 | testTypes: ['e2e'] 43 | imageOSs: ['ubuntu-18.04'] # 'windows-latest', 'macos-latest' are not supported 44 | 45 | - stage: collect_final_code_coverage 46 | dependsOn: 47 | - unit_tests 48 | - e2e_tests 49 | jobs: 50 | - template: templates/merge-cov-reports.yml 51 | 52 | - stage: publish_artifacts 53 | jobs: 54 | - job: archive_wheel_and_sdist 55 | pool: 56 | vmImage: 'ubuntu-latest' 57 | steps: 58 | - template: 
templates/build_wheel_n_sdist.yml 59 | 60 | - task: PublishBuildArtifacts@1 61 | inputs: 62 | PathtoPublish: $(Build.SourcesDirectory)/dist 63 | ArtifactName: distribution_artifacts 64 | publishLocation: 'Container' 65 | displayName: 'Publish wheel and sdist' 66 | -------------------------------------------------------------------------------- /devops/pr-gate.yml: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | name: $(Date:yyyyMMdd).$(Rev:r) 7 | 8 | trigger: none # trigger only via pr 9 | 10 | pr: 11 | - main 12 | 13 | variables: 14 | - group: azureResourceKeys 15 | 16 | stages: 17 | - stage: static_analysis 18 | jobs: 19 | - job: flake8_linux_py36 20 | pool: 21 | vmImage: 'ubuntu-latest' 22 | steps: 23 | - template: templates/base/run-linter.yml 24 | parameters: 25 | pyVersion: '3.6' 26 | - task: ComponentGovernanceComponentDetection@0 27 | 28 | - stage: unit_tests 29 | dependsOn: static_analysis 30 | jobs: 31 | - template: templates/run-tests-on-multiple-os-py.yml 32 | parameters: 33 | pyVersions: ['3.6', '3.7', '3.8'] 34 | testTypes: ['unit', 'io'] 35 | imageOSs: ['ubuntu-18.04'] # 'windows-latest', 'macos-latest' are not supported 36 | 37 | - stage: e2e_tests 38 | dependsOn: static_analysis 39 | jobs: 40 | - template: templates/run-tests-on-multiple-os-py.yml 41 | parameters: 42 | pyVersions: ['3.6'] 43 | testTypes: ['e2e'] 44 | imageOSs: ['ubuntu-18.04'] # 'windows-latest', 'macos-latest' are not supported 45 | 46 | - stage: collect_final_code_coverage 47 | dependsOn: 48 | - unit_tests 49 | - e2e_tests 50 | jobs: 51 | - template: templates/merge-cov-reports.yml 52 | -------------------------------------------------------------------------------- /devops/release.yml: 
-------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | trigger: none # Need manual trigger 7 | 8 | parameters: 9 | - name: releaseType 10 | displayName: Release Type 11 | type: string 12 | default: Test 13 | values: 14 | - Test 15 | - Production 16 | 17 | strategy: 18 | matrix: 19 | linux_x64_py3.6: 20 | imageName: 'ubuntu-18.04' 21 | python.version: '3.6' 22 | 23 | pool: 24 | vmImage: '$(imageName)' 25 | 26 | steps: 27 | - task: UsePythonVersion@0 28 | inputs: 29 | versionSpec: '$(python.version)' 30 | addToPath: true 31 | architecture: 'x64' 32 | displayName: 'Use Python $(python.version)' 33 | 34 | - bash: | 35 | pip install --upgrade pip 36 | pip install setuptools wheel 37 | python setup.py bdist_wheel --dist-dir dist 38 | python setup.py sdist --dist-dir dist 39 | workingDirectory: $(Build.SourcesDirectory) 40 | displayName: 'Building wheel package & sdist' 41 | 42 | - task: GitHubRelease@1 43 | inputs: 44 | gitHubConnection: 'github.com_laserprec' 45 | repositoryName: 'microsoft/genalog' 46 | action: 'create' 47 | target: '$(Build.SourceVersion)' 48 | tagSource: 'gitTag' 49 | tagPattern: 'v.*' 50 | releaseNotesFilePath: 'CHANGELOG.md' 51 | assets: '$(Build.SourcesDirectory)/dist/*' 52 | isDraft: true 53 | changeLogCompareToRelease: 'lastFullRelease' 54 | changeLogType: 'commitBased' 55 | condition: ${{eq(parameters.releaseType, 'Test')}} 56 | displayName: 'Prepare GitHub Release (Draft)' 57 | 58 | - bash: | 59 | pip install twine 60 | workingDirectory: $(Build.SourcesDirectory) 61 | displayName: 'Install twine' 62 | 63 | - task: TwineAuthenticate@1 64 | inputs: 65 | pythonUploadServiceConnection: testpypi 66 | condition: ${{eq(parameters.releaseType, 'Test')}} 67 | displayName: 'Twine Authentication 
for Test' 68 | 69 | - task: TwineAuthenticate@1 70 | inputs: 71 | pythonUploadServiceConnection: pypi 72 | condition: ${{eq(parameters.releaseType, 'Production')}} 73 | displayName: 'Twine Authentication for Production' 74 | 75 | - bash: | 76 | twine upload --verbose -r genalog --config-file $(PYPIRC_PATH) dist/*.whl 77 | workingDirectory: $(Build.SourcesDirectory) 78 | displayName: 'Uploading Wheel to ${{parameters.releaseType}} PyPI' -------------------------------------------------------------------------------- /devops/templates/base/publish-test-results.yml: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | # Template for publishing test result report 7 | parameters: 8 | - name: pyVersion 9 | type: string 10 | 11 | steps: 12 | - task: PublishTestResults@2 13 | inputs: 14 | testResultsFormat: 'JUnit' 15 | testResultsFiles: 'junit/*.xml' 16 | searchFolder: $(Build.SourcesDirectory) 17 | testRunTitle: $(Agent.OS) py$(pyVersion) Build 18 | buildPlatform: $(Agent.OS) 19 | condition: always() # Always publish test results 20 | displayName: 'Publish test report' -------------------------------------------------------------------------------- /devops/templates/base/run-linter.yml: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # --------------------------------------------------------- 5 | 6 | # Template for running linter and other static analysis tools on the code 7 | parameters: 8 | - name: pyVersion 9 | type: string 10 | default: '3.6' 11 | 12 | steps: 13 | - task: UsePythonVersion@0 14 | inputs: 15 | versionSpec: ${{ parameters.pyVersion }} 16 | addToPath: true 17 | architecture: 'x64' 18 | displayName: 'Use Python ${{ parameters.pyVersion }}' 19 | 20 | - bash: | 21 | python -m pip install --upgrade pip setuptools wheel 22 | python -m pip install -r requirements-dev.txt 23 | workingDirectory: $(Build.SourcesDirectory) 24 | displayName: 'Install flake8 and other dev dependencies' 25 | 26 | - bash: | 27 | tox -e flake8 28 | workingDirectory: $(Build.SourcesDirectory) 29 | displayName: 'Run Linter (flake8)' -------------------------------------------------------------------------------- /devops/templates/base/run-tests.yml: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # --------------------------------------------------------- 5 | 6 | # Template for running tests on multiple Python versions and platforms 7 | parameters: 8 | - name: testType 9 | type: string 10 | default: all 11 | values: 12 | - unit 13 | - e2e 14 | - slow 15 | - azure 16 | - io 17 | - all 18 | 19 | steps: 20 | - bash: | 21 | if [[ '${{parameters.testType}}' == 'all' ]] 22 | then 23 | tox -e py 24 | elif [[ '${{parameters.testType}}' == 'unit' ]] 25 | then 26 | tox -e py -- tests/unit 27 | elif [[ '${{parameters.testType}}' == 'e2e' ]] 28 | then 29 | tox -e py -- tests/e2e 30 | else 31 | tox -e py -- -m "${{parameters.testType}}" 32 | fi 33 | env: 34 | # These keys come from azureResourceKeys variable group 35 | BLOB_KEY : $(BLOB_KEY) 36 | SEARCH_SERVICE_KEY: $(SEARCH_SERVICE_KEY) 37 | COGNITIVE_SERVICE_KEY: $(COGNITIVE_SERVICE_KEY) 38 | COMPUTER_VISION_SUBSCRIPTION_KEY: $(COMPUTER_VISION_SUBSCRIPTION_KEY) 39 | workingDirectory: $(Build.SourcesDirectory) 40 | displayName: 'Running (${{parameters.testType}}) Tests' 41 | -------------------------------------------------------------------------------- /devops/templates/build_wheel_n_sdist.yml: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # --------------------------------------------------------- 5 | 6 | # Template to create wheel and source distribution 7 | parameters: 8 | - name: pyVersion 9 | default: '3.6' 10 | 11 | steps: 12 | - task: UsePythonVersion@0 13 | inputs: 14 | versionSpec: ${{ parameters.pyVersion }} 15 | addToPath: true 16 | architecture: 'x64' 17 | displayName: 'Use Python ${{ parameters.pyVersion }}' 18 | 19 | - bash: | 20 | python -m pip install --upgrade pip setuptools wheel 21 | displayName: 'Update pip and setuptools' 22 | 23 | - bash: | 24 | python setup.py bdist_wheel 25 | workingDirectory: $(Build.SourcesDirectory) 26 | displayName: 'Build wheel' 27 | 28 | - bash: | 29 | python setup.py sdist 30 | workingDirectory: $(Build.SourcesDirectory) 31 | displayName: 'Build source distribution' 32 | 33 | - bash: | 34 | ls dist 35 | workingDirectory: $(Build.SourcesDirectory) 36 | displayName: 'Show artifacts in folder' 37 | -------------------------------------------------------------------------------- /devops/templates/install-dependencies.yml: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | # Assume a python version is enabled with "UsePythonVersion@0" task 7 | steps: 8 | - bash: | 9 | python -m pip install --upgrade pip setuptools wheel 10 | python -m pip install -r requirements-dev.txt 11 | workingDirectory: $(Build.SourcesDirectory) 12 | displayName: 'Install dependencies' 13 | -------------------------------------------------------------------------------- /devops/templates/merge-cov-reports.yml: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 
 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | # Template to merge several code coverage reports (.coverage*) 7 | parameters: 8 | - name: pyVersion 9 | default: '3.6' 10 | 11 | jobs: 12 | - job: 13 | displayName: Merge cov reports 14 | pool: 15 | vmImage: 'ubuntu-latest' 16 | 17 | steps: 18 | - task: UsePythonVersion@0 19 | inputs: 20 | versionSpec: ${{ parameters.pyVersion }} 21 | addToPath: true 22 | architecture: 'x64' 23 | displayName: 'Use Python ${{ parameters.pyVersion }}' 24 | 25 | - bash: | 26 | python -m pip install --upgrade pip setuptools 27 | python -m pip install coverage 28 | workingDirectory: $(Build.SourcesDirectory) 29 | displayName: 'Install coverage' 30 | 31 | # See https://docs.microsoft.com/en-us/azure/devops/pipelines/artifacts/pipeline-artifacts?view=azure-devops&tabs=yaml#multiple-artifacts 32 | - download: current 33 | patterns: '**/.coverage*' 34 | 35 | - bash: | 36 | python -m coverage combine $(Pipeline.Workspace)/**/.coverage* 37 | python -m coverage report 38 | python -m coverage xml 39 | workingDirectory: $(Build.SourcesDirectory) 40 | displayName: Show and merge cached coverage report 41 | 42 | - task: PublishCodeCoverageResults@1 43 | inputs: 44 | codeCoverageTool: Cobertura 45 | summaryFileLocation: '$(Build.SourcesDirectory)/coverage.xml' 46 | displayName: 'Publish merged code coverage report' 47 | 48 | 49 | -------------------------------------------------------------------------------- /devops/templates/run-tests-on-multiple-os-py.yml: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # --------------------------------------------------------- 5 | 6 | parameters: 7 | - name: pyVersions 8 | type: object 9 | default: ['3.6', '3.7', '3.8'] 10 | - name: testTypes 11 | type: object 12 | default: ['fast', 'slow'] 13 | - name: imageOSs 14 | type: object 15 | default: ['ubuntu-latest'] 16 | 17 | jobs: 18 | - ${{ each imageOS in parameters.imageOSs }}: 19 | - ${{ each pyVersion in parameters.pyVersions }}: 20 | - job: 21 | displayName: ${{imageOS}} py${{pyVersion}} 22 | pool: 23 | vmImage: ${{imageOS}} 24 | steps: 25 | 26 | - task: UsePythonVersion@0 27 | inputs: 28 | versionSpec: ${{pyVersion}} 29 | addToPath: true 30 | architecture: 'x64' 31 | displayName: 'Use Python ${{pyVersion}}' 32 | 33 | - template: install-dependencies.yml 34 | 35 | - ${{ each testType in parameters.testTypes }}: 36 | - template: base/run-tests.yml 37 | parameters: 38 | testType: ${{testType}} 39 | - template: base/publish-test-results.yml 40 | parameters: 41 | pyVersion: ${{pyVersion}} 42 | 43 | - bash: | 44 | mv .coverage .coverage_$(System.StageName)_${{imageOS}}_${{pyVersion}} 45 | ls .coverage* 46 | workingDirectory: $(Build.SourcesDirectory) 47 | displayName: 'Rename coverage report' 48 | # Cache the coverage report 49 | - publish: $(Build.SourcesDirectory)/.coverage_$(System.StageName)_${{imageOS}}_${{pyVersion}} 50 | artifact: cov_report_$(System.StageName)_${{imageOS}}_${{pyVersion}} -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | **/example.txt 2 | **/_build 3 | **/data -------------------------------------------------------------------------------- /docs/genalog_docs/_config.yml: -------------------------------------------------------------------------------- 1 | title : '' 2 | author: Jianjie Liu and Amit Gupte 3 | logo: static/genalog_full_logo.svg 4 | 5 | # Short description about the book 6 | description: >- 7 | Guide for 
end-to-end synthetic analog document generation 8 | 9 | execute: 10 | execute_notebooks : off 11 | 12 | # Interact link settings 13 | notebook_interface : "notebook" 14 | 15 | # Launch button settings 16 | repository: 17 | url : https://github.com/microsoft/genalog 18 | path_to_book : /docs/genalog_docs 19 | branch : main 20 | 21 | launch_buttons: 22 | notebook_interface : classic 23 | 24 | # HTML-specific settings 25 | html: 26 | favicon : static/genalog_favicon.svg 27 | home_page_in_navbar : false 28 | use_repository_button : true 29 | use_issues_button : true 30 | baseurl : https://microsoft.github.io/genalog/ 31 | extra_footer : "Don't forget to check out our paper from Document Intelligence Workshop at KDD 2021!" 33 | 34 | sphinx: 35 | extra_extensions: 36 | - sphinx_inline_tabs 37 | - sphinx.ext.autodoc 38 | - sphinx.ext.napoleon 39 | - sphinx.ext.viewcode 40 | config: 41 | napoleon_google_docstring: True 42 | autodoc_member_order: groupwise 43 | autoclass_content: both 44 | -------------------------------------------------------------------------------- /docs/genalog_docs/_toc.yml: -------------------------------------------------------------------------------- 1 | root: index 2 | format: jb-book 3 | defaults: 4 | numbered: false 5 | parts: 6 | - caption: Getting Started 7 | chapters: 8 | - file: installation 9 | - file: generation_pipeline 10 | - file: e2e_dataset_pipeline 11 | - caption: Fabricating Document & Noise 12 | chapters: 13 | - file: doc_generation 14 | - file: doc_degradation 15 | - caption: Handling Noisy Text 16 | chapters: 17 | - file: text_alignment 18 | - file: ocr_label_propagation 19 | - caption: API Documentation 20 | chapters: 21 | - file: docstring/genalog.degradation 22 | - file: docstring/genalog.generation 23 | - file: docstring/genalog.ocr 24 | - file: docstring/genalog.text 25 | -------------------------------------------------------------------------------- /docs/genalog_docs/docstring/genalog.degradation.rst: 
-------------------------------------------------------------------------------- 1 | genalog.degradation 2 | ==================== 3 | 4 | Image Degrader 5 | ----------------------------------- 6 | 7 | .. automodule:: genalog.degradation.degrader 8 | :members: 9 | 10 | Degradation Effects 11 | --------------------------------- 12 | 13 | .. automodule:: genalog.degradation.effect 14 | :members: 15 | :show-inheritance: -------------------------------------------------------------------------------- /docs/genalog_docs/docstring/genalog.generation.rst: -------------------------------------------------------------------------------- 1 | genalog.generation 2 | ========================== 3 | 4 | genalog.generation.content module 5 | --------------------------------- 6 | 7 | .. automodule:: genalog.generation.content 8 | :members: 9 | :show-inheritance: 10 | 11 | genalog.generation.document module 12 | ---------------------------------- 13 | 14 | .. automodule:: genalog.generation.document 15 | :members: 16 | :show-inheritance: 17 | -------------------------------------------------------------------------------- /docs/genalog_docs/docstring/genalog.ocr.rst: -------------------------------------------------------------------------------- 1 | genalog.ocr 2 | =================== 3 | 4 | This module will be *deprecated* in favor of the official `Azure Computer Vision SDK `_ . 5 | 6 | genalog.ocr.common module 7 | ------------------------- 8 | 9 | .. automodule:: genalog.ocr.common 10 | :members: 11 | 12 | genalog.ocr.grok module 13 | ----------------------- 14 | 15 | .. automodule:: genalog.ocr.grok 16 | :members: 17 | 18 | genalog.ocr.metrics module 19 | -------------------------- 20 | 21 | .. automodule:: genalog.ocr.metrics 22 | :members: 23 | 24 | genalog.ocr.rest\_client module 25 | ------------------------------- 26 | 27 | .. automodule:: genalog.ocr.rest_client 28 | :members: 29 | 30 | genalog.ocr.blob\_client module 31 | ------------------------------- 32 | 33 | .. 
automodule:: genalog.ocr.blob_client 34 | :members: 35 | 36 | -------------------------------------------------------------------------------- /docs/genalog_docs/docstring/genalog.text.rst: -------------------------------------------------------------------------------- 1 | genalog.text 2 | ==================== 3 | 4 | genalog.text.alignment module 5 | ----------------------------- 6 | 7 | .. automodule:: genalog.text.alignment 8 | :members: 9 | 10 | genalog.text.anchor module 11 | -------------------------- 12 | 13 | .. automodule:: genalog.text.anchor 14 | :members: 15 | 16 | genalog.text.conll\_format module 17 | --------------------------------- 18 | 19 | .. automodule:: genalog.text.conll_format 20 | :members: 21 | 22 | genalog.text.lcs module 23 | ----------------------- 24 | 25 | .. automodule:: genalog.text.lcs 26 | :members: 27 | 28 | genalog.text.ner\_label module 29 | ------------------------------ 30 | 31 | .. automodule:: genalog.text.ner_label 32 | :members: 33 | :private-members: _propagate_label_to_ocr 34 | 35 | genalog.text.preprocess module 36 | ------------------------------ 37 | 38 | .. automodule:: genalog.text.preprocess 39 | :members: 40 | 41 | genalog.text.splitter module 42 | ---------------------------- 43 | 44 | .. automodule:: genalog.text.splitter 45 | :members: 46 | 47 | 48 | -------------------------------------------------------------------------------- /docs/genalog_docs/e2e_dataset_pipeline.md: -------------------------------------------------------------------------------- 1 | # OCR-NER Dataset Generation 2 | 3 | ```{image} static/labeled_synthetic_pipeline.png 4 | :width: 80% 5 | :align: center 6 | ``` 7 | 8 | If you were brought here by [our paper](https://arxiv.org/abs/2108.02899), you may be interested in the data preparation pipeline built with `genalog`. 
The figure above shows the steps involved in transforming a Named-Entity Recognition (NER) dataset like [CoNLL 2003](https://deepai.org/dataset/conll-2003-english) with synthetic Optical Character Recognition (OCR) errors. This OCR-NER dataset is useful to train an error-prone NER model against common OCR mistakes. You can find the full dataset preparation pipeline in this [notebook](https://github.com/microsoft/genalog/blob/main/example/dataset_generation.ipynb) from our repo. 9 | 10 | We believe this methodology of inducing OCR errors onto the dataset can be applied to other NLP tasks to improve model performance against inherent noise from OCR outputs. We welcome the community to contribute if this fits your use cases. 11 | 12 | 13 | -------------------------------------------------------------------------------- /docs/genalog_docs/index.md: -------------------------------------------------------------------------------- 1 | # Synthetic Document Generator 2 | 3 | ![Python Versions](https://img.shields.io/badge/py-3.6%20%7C%203.7%20%7C%203.8%20-blue) [![arxiv link](https://img.shields.io/badge/arxiv-2108.02899-critical)](https://arxiv.org/abs/2108.02899) ![MIT license](https://img.shields.io/badge/License-MIT-blue.svg) 4 | 5 | ````{margin} 6 | ```sh 7 | pip install genalog 8 | ``` 9 | Star Us 10 | ```` 11 | 12 | `genalog` is an open source, cross-platform python package for **gen**erating document images with synthetic noise that mimics scanned an**alog** documents (thus the name `genalog`). You can also add various text degradations to these images. The purpose of this tool is to provide a fast and efficient way to generate synthetic documents from text data by leveraging layout from templates that you can create in simple HTML format. 13 | 14 | ```{figure} static/genalog_demo.gif 15 | :width: 80% 16 | Generate documents and apply degradations 17 | ``` 18 | 19 | `genalog` provides several document templates as a start. 
You can alter the document layout using standard CSS properties like `font-family`, `font-size`, `text-align`, etc. Here are some of the example generated documents: 20 | 21 | ````{tab} Multi-Column 22 | ```{figure} static/columns_Times_11px.png 23 | :width: 60% 24 | :name: two-columns-index 25 | Document template with 2 columns 26 | ``` 27 | ```` 28 | ````{tab} Letter-like 29 | ```{figure} static/letter_Times_11px.png 30 | :width: 60% 31 | :name: letter-like-index 32 | Letter-like document template 33 | ``` 34 | ```` 35 | ````{tab} Simple Text Block 36 | ```{figure} static/text_block_Times_11px.png 37 | :width: 60% 38 | :name: text-block-index 39 | Simple text block template 40 | ``` 41 | ```` 42 | 43 | Once a document is generated, you can combine various image degradation effects and apply onto the synthetic documents. Here are some of the degradation effects: 44 | 45 | ````{tab} Bleed-through 46 | ```{figure} static/bleed_through.png 47 | :name: bleed-through-index 48 | :width: 80% 49 | Mimics a document printed on two sides 50 | ``` 51 | ```` 52 | ````{tab} Blur 53 | ```{figure} static/blur.png 54 | :name: blur-index 55 | :width: 80% 56 | Lowers image quality 57 | ``` 58 | ```` 59 | ````{tab} Salt/Pepper 60 | ```{figure} static/salt_pepper.png 61 | :name: salt/pepper-index 62 | :width: 50% 63 | Mimics ink degradation 64 | ``` 65 | ```` 66 | `````{tab} Close/Dilate 67 | ```{figure} static/close_dilate.png 68 | :name: close-dilate-index 69 | :width: 90% 70 | Degrades printing quality 71 | ``` 72 | ````{margin} 73 | ```{note} 74 | For more details on this degradation, see [Morphilogical Operations](https://homepages.inf.ed.ac.uk/rbf/HIPR2/morops.htm) 75 | ``` 76 | ```` 77 | ````` 78 | `````{tab} Open/Erode 79 | ```{figure} static/open_erode.png 80 | :name: open-erode-index 81 | :width: 90% 82 | Ink overflows 83 | ``` 84 | ````{margin} 85 | ```{note} 86 | For more details on this degradation, see [Morphilogical 
Operations](https://homepages.inf.ed.ac.uk/rbf/HIPR2/morops.htm) 87 | ``` 88 | ```` 89 | ````` 90 | ````{tab} Combined Effects 91 | ```{figure} static/degrader.png 92 | :width: 40% 93 | :name: combined-effects-index 94 | Combining various degradation effects: blur, salt, open, and bleed-through 95 | ``` 96 | ```` 97 | 98 | In addition to the document generation and degradation, `genalog` also provide efficient implementation for [text alignment](text-alignment-page) between the source and noise text. 99 | 100 | -------------------------------------------------------------------------------- /docs/genalog_docs/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | Genalog is supported across Windows, Mac and Linux on Python 3.6+. However there are *additional* installation steps for Windows and Mac users. 4 | 5 | 6 | ````{tab} pip 7 | ```sh 8 | pip install genalog 9 | ``` 10 | ```` 11 | ````{tab} source 12 | ```sh 13 | git clone https://github.com/microsoft/genalog.git && cd genalog && pip install -e . 14 | ``` 15 | ```` 16 | 17 | ## Extra Steps for Windows & Mac Users 18 | 19 | We have a dependency on [`Weasyprint`](https://weasyprint.readthedocs.io/en/stable/install.html) for image generation, which in turn has non-python dependencies including `Pango`, `cairo` and `GDK-PixBuf` that need to be installed separately. 20 | 21 | So far, `Pango`, `cairo` and `GDK-PixBuf` libraries are available in `Ubuntu-18.04` and later by default. 22 | 23 | If you are running on Windows, MacOS, or other Linux distributions, please see [installation instructions from WeasyPrint](https://weasyprint.readthedocs.io/en/stable/install.html). 24 | 25 | ```{note} 26 | If you encounter the errors like `no library called "libcairo-2" was found`, this is probably due to the three extra dependencies missing. 
27 | ``` 28 | 29 | -------------------------------------------------------------------------------- /docs/genalog_docs/static/analog_doc_gen_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/analog_doc_gen_pipeline.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/bleed_through.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/bleed_through.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/blur.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/blur.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/close_dilate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/close_dilate.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/columns_Times_11px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/columns_Times_11px.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/degrader.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/degrader.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/degrader_heavy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/degrader_heavy.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/genalog_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/genalog_demo.gif -------------------------------------------------------------------------------- /docs/genalog_docs/static/kernel_morph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/kernel_morph.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/labeled_synthetic_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/labeled_synthetic_pipeline.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/letter_Times_11px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/letter_Times_11px.png -------------------------------------------------------------------------------- 
/docs/genalog_docs/static/open_erode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/open_erode.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/salt_pepper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/salt_pepper.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/text_block_Times_11px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/text_block_Times_11px.png -------------------------------------------------------------------------------- /docs/requirements-doc.txt: -------------------------------------------------------------------------------- 1 | jupyter-book 2 | sphinx 3 | sphinx_inline_tabs 4 | ghp-import -------------------------------------------------------------------------------- /example/demo_generate.py: -------------------------------------------------------------------------------- 1 | #%% 2 | from genalog.pipeline import AnalogDocumentGeneration 3 | from genalog.degradation.degrader import ImageState 4 | 5 | sample_text = "sample/generation/example.txt" 6 | 7 | # Common CSS properties 8 | STYLE_COMBINATIONS = { 9 | "font_family" : ["Times"], # sans-serif, Times, monospace, etc 10 | "font_size" : ["12px"], 11 | "text_align" : ["justify"], # left, right, center, justify 12 | "language" : ["en_US"], # controls how words are hyphenated 13 | "hyphenate" : [True], 14 | } 15 | 16 | # .html.jinja 17 | HTML_TEMPLATE = "columns.html.jinja" 18 | 19 | # Degration effects applied in 
sequence 20 | DEGRADATIONS = [ 21 | ("blur", {"radius": 3}), # needs to be an odd number 22 | ("bleed_through", { 23 | "src": ImageState.CURRENT_STATE, "background": ImageState.ORIGINAL_STATE, 24 | "alpha": 0.8, 25 | "offset_y": 9, "offset_x": 12 26 | }), 27 | ("morphology", {"operation": "open", "kernel_shape": (3, 3)}), 28 | ("pepper", {"amount": 0.05}), 29 | ("salt", {"amount": 0.05}), 30 | ] 31 | 32 | doc_generation = AnalogDocumentGeneration(styles=STYLE_COMBINATIONS, degradations=DEGRADATIONS) 33 | img_array = doc_generation.generate_img(sample_text, HTML_TEMPLATE, target_folder=None) 34 | 35 | import cv2 36 | from IPython.core.display import Image, display 37 | 38 | _, encoded_image = cv2.imencode('.png', img_array) 39 | display(Image(data=encoded_image, width=600)) 40 | 41 | -------------------------------------------------------------------------------- /example/ocr_label_propagation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## `genalog.text` module: \n", 8 | "This module is responsible for:\n", 9 | "1. Text alignment\n", 10 | "1. 
NER label propagation using text alignment results" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from genalog.text import ner_label\n", 20 | "from genalog.text import preprocess\n", 21 | "\n", 22 | "gt_txt = \"New York is big\"\n", 23 | "ocr_txt = \"New Yo rkis big\"\n", 24 | "\n", 25 | "# Input to the method\n", 26 | "gt_labels = [\"B-P\", \"I-P\", \"O\", \"O\"]\n", 27 | "gt_tokens = preprocess.tokenize(gt_txt) # tokenize into list of tokens\n", 28 | "ocr_tokens = preprocess.tokenize(ocr_txt)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "['B-P', 'I-P', 'O', 'O']\n", 41 | "['New', 'York', 'is', 'big']\n", 42 | "['New', 'Yo', 'rkis', 'big']\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "# Inputs to the method\n", 48 | "print(gt_labels)\n", 49 | "print(gt_tokens)\n", 50 | "print(ocr_tokens)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 5, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Method returns a tuple of 4 elements (gt_tokens, gt_labels, ocr_tokens, ocr_labels, gap_char)\n", 60 | "ocr_labels, aligned_gt, aligned_ocr, gap_char = ner_label.propagate_label_to_ocr(gt_labels, gt_tokens, ocr_tokens)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 6, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "OCR labels: ['B-P', 'I-P', 'I-P', 'O']\n", 73 | "Aligned ground truth: New Yo@rk is big\n", 74 | "Alinged OCR text: New Yo rk@is big\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "# Outputs\n", 80 | "print(f\"OCR labels: {ocr_labels}\")\n", 81 | "print(f\"Aligned ground truth: {aligned_gt}\")\n", 82 | "print(f\"Alinged OCR text: {aligned_ocr}\")" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 
87 | "execution_count": 9, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "name": "stdout", 92 | "output_type": "stream", 93 | "text": [ 94 | "B-P I-P O O \n", 95 | "New York is big \n", 96 | "New Yo@rk is big\n", 97 | "||||||.||.||||||\n", 98 | "New Yo rk@is big\n", 99 | "New Yo rkis big \n", 100 | "B-P I-P I-P O \n", 101 | "\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "# Format result for display\n", 107 | "print(ner_label.format_label_propagation(gt_tokens, gt_labels, ocr_tokens, ocr_labels, aligned_gt, aligned_ocr))" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 12, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "B-P I-P O O \n", 120 | "New York is big \n", 121 | "New Yo rkis big \n", 122 | "B-P I-P I-P O \n", 123 | "\n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "# To turn off alignment information:\n", 129 | "print(ner_label.format_label_propagation(gt_tokens, gt_labels, ocr_tokens, ocr_labels, aligned_gt, aligned_ocr, show_alignment=False))" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 14, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "B-P I-P I-P O \n", 142 | "New Yo rkis big \n", 143 | "\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "# Format tokens and labels\n", 149 | "print(ner_label.format_labels(ocr_tokens, ocr_labels))" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [] 158 | } 159 | ], 160 | "metadata": { 161 | "kernelspec": { 162 | "display_name": "Python 3", 163 | "language": "python", 164 | "name": "python3" 165 | }, 166 | "language_info": { 167 | "codemirror_mode": { 168 | "name": "ipython", 169 | "version": 3 170 | }, 171 | "file_extension": ".py", 172 | "mimetype": "text/x-python", 173 | "name": "python", 174 
| "nbconvert_exporter": "python", 175 | "pygments_lexer": "ipython3", 176 | "version": "3.6.9" 177 | } 178 | }, 179 | "nbformat": 4, 180 | "nbformat_minor": 4 181 | } 182 | -------------------------------------------------------------------------------- /example/sample/degradation/bleed_through.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/bleed_through.png -------------------------------------------------------------------------------- /example/sample/degradation/blur.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/blur.png -------------------------------------------------------------------------------- /example/sample/degradation/close_dilate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/close_dilate.png -------------------------------------------------------------------------------- /example/sample/degradation/degrader.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/degrader.png -------------------------------------------------------------------------------- /example/sample/degradation/degrader_heavy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/degrader_heavy.png -------------------------------------------------------------------------------- 
/example/sample/degradation/kernel_morph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/kernel_morph.png -------------------------------------------------------------------------------- /example/sample/degradation/open_erode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/open_erode.png -------------------------------------------------------------------------------- /example/sample/degradation/salt_pepper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/salt_pepper.png -------------------------------------------------------------------------------- /example/sample/degradation/text_block.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/text_block.png -------------------------------------------------------------------------------- /example/sample/degradation/text_zoomed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/text_zoomed.png -------------------------------------------------------------------------------- /example/sample/generation/columns_Times_11px.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/columns_Times_11px.pdf 
-------------------------------------------------------------------------------- /example/sample/generation/columns_Times_11px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/columns_Times_11px.png -------------------------------------------------------------------------------- /example/sample/generation/example.txt: -------------------------------------------------------------------------------- 1 | Time magazine , in a move to reduce the costs of wooing new subscribers , is lowering its circulation guarantee to advertisers for the second consecutive year , increasing its subscription rates and cutting back on merchandise giveaways . 2 | In an announcement to its staff last week , executives at Time Warner Inc. 's weekly magazine said Time will `` dramatically de-emphasize '' its use of electronic giveaways such as telephones in television subscription drives ; cut the circulation it guarantees advertisers by 300,000 , to four million ; and increase the cost of its annual subscription rate by about $ 4 to $ 55 . 3 | In a related development , the news - weekly , for the fourth year in a row , said it wo n't increase its advertising rates in 1990 ; a full , four - color page in the magazine costs about $ 120,000 . 4 | However , because the guaranteed circulation base is being lowered , ad rates will be effectively 7.5 % higher per subscriber , according to Richard Heinemann , Time associate publisher . 5 | Time is following the course of some other mass - circulation magazines that in recent years have challenged the publishing myth that maintaining artificially high , and expensive , circulations is the way to draw advertisers . 6 | In recent years , Reader 's Digest , New York Times Co. 's McCall 's , and most recently News Corp. 
's TV Guide , have cut their massive circulation rate bases to eliminate marginal circulation and hold down rates for advertisers . 7 | Deep discounts in subscriptions and offers of free clock radios and watches have become accepted forms of attracting new subscribers in the hyper-competitive world of magazine news - weeklies . 8 | But Time , as part of the more cost - conscious Time Warner , wants to wean itself away from expensive gimmicks . 9 | Besides , Time executives think selling a news magazine with a clock radio is tacky . 10 | 11 | 12 | `` Giveaways just give people the wrong image , '' said Mr. Heinemann . 13 | `` That perception takes the focus off the magazine . '' 14 | Time magazine executives predictably paint the circulation cut as a show of strength and actually a benefit to advertisers . 15 | `` What we are doing is screening out the readers who are only casually related to the magazine and do n't really read it , '' said Mr. Heinemann . 16 | `` We are trying to create quality and involvement . '' 17 | However , Time executives used the same explanation when in October 1988 the magazine cut its guaranteed circulation from 4.6 million to 4.3 million . 18 | And Time 's paid circulation , according to Audit Bureau of Circulations , dropped 7.3 % to 4,393,237 in the six months ended June 30 , 1989 . 19 | Still , Time 's move is being received well , once again . 20 | `` It 's terrific for advertisers to know the reader will be paying more , '' said Michael Drexler , national media director at Bozell Inc. ad agency . 21 | `` A few drops in circulation are of no consequence . 22 | It 's not a show of weakness ; they are improving the quality of circulation while insuring their profits . '' 23 | Mr. Heinemann said the changes represent a new focus in the magazine industry : a magazine 's net revenue per subscriber , or the actual revenue from subscribers after discounts and the cost of premiums have been stripped away . 
24 | `` The question is how much are we getting from each reader , '' said Mr. Heinemann . 25 | Time 's rivals news - weeklies , Washington Post Co. 's Newsweek and U.S. News & World Report , are less reliant on electronic giveaways , and in recent years both have been increasing their circulation rate bases . 26 | Both magazines are expected to announce their ad rates and circulation levels for 1990 within a month . -------------------------------------------------------------------------------- /example/sample/generation/letter_Times_11px.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/letter_Times_11px.pdf -------------------------------------------------------------------------------- /example/sample/generation/letter_Times_11px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/letter_Times_11px.png -------------------------------------------------------------------------------- /example/sample/generation/text_block_Times_11px.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/text_block_Times_11px.pdf -------------------------------------------------------------------------------- /example/sample/generation/text_block_Times_11px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/text_block_Times_11px.png -------------------------------------------------------------------------------- /example/sample/generation/text_block_Times_11px_pg_0.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/text_block_Times_11px_pg_0.png -------------------------------------------------------------------------------- /example/sample/generation/text_block_Times_11px_pg_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/text_block_Times_11px_pg_1.png -------------------------------------------------------------------------------- /example/static/analog_doc_gen_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/static/analog_doc_gen_pipeline.png -------------------------------------------------------------------------------- /example/static/genalog_components.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/static/genalog_components.png -------------------------------------------------------------------------------- /example/static/labeled_synthetic_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/static/labeled_synthetic_pipeline.png -------------------------------------------------------------------------------- /genalog/README.md: -------------------------------------------------------------------------------- 1 | # Genalog Core 2 | 3 | This is the core of the package and contains all core components necessary to generate new docs, degrade the documents and get text out of degraded images using 
OCR Capabilities of Azure. 4 | 5 | ## Image Generation 6 | 7 | This directory contains the class implementations for image generation. The image generation leverages [Jinja templates](https://jinja.palletsprojects.com/en/2.11.x/templates/) for image generation. You can create a Jinja HTML template for any image layout and specify content variables to add content into images. This allows you the flexibility to be as declarative as possible. 8 | 9 | [Here is our guide to Image Generation](generation/README.md) 10 | 11 | ## Image Degradation 12 | 13 | This directory contains the class implementations for degrading your images such that they simulate real world Document degradations. 14 | 15 | [Here is our guide to Image Degradation](degradation/README.md) 16 | 17 | ## Extract Text from Images 18 | 19 | This directory contains the class implementations for Extract Text from Images using Azure OCR Process. 20 | 21 | [Here is our guide to Extract Text from Images](ocr/README.md) 22 | 23 | ## Text Alignment 24 | 25 | This directory contains the class implementations for text alignment. We expect that these capabilities will be required when you need to align text with its incorrect versions when you degrade documents and then have errors in OCR. We use [Biopython's](https://biopython.org/) implementation of the Needleman-Wunsch algorithm for text alignment as the method `genalog.text.alignment.align()`. This algorithm is an exhaustive search for all possible candidates with dynamic programming. It produces weighted score for each candidate and returns those having the highest score. Note this is an algorithm with quadratic time and space complexity, and is not so efficient on aligning longer strings. 
26 | 27 | For more efficient alignment on longer documents, we also include an implementation of the RETAS method from the paper ["A Fast Alignment Scheme for Automatic OCR Evaluation of Books"](https://ieeexplore.ieee.org/document/6065412) in `genalog.text.anchor.align_w_anchor()`. We would recommend using this method for input longer than 200 characters. 28 | 29 | [Here is our guide to Text Alignment](text/README.md) 30 | -------------------------------------------------------------------------------- /genalog/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/genalog/__init__.py -------------------------------------------------------------------------------- /genalog/degradation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/genalog/degradation/__init__.py -------------------------------------------------------------------------------- /genalog/generation/.gitignore: -------------------------------------------------------------------------------- 1 | # output folders for debugging purpose 2 | output/ 3 | # sample input for debugging 4 | sample/ -------------------------------------------------------------------------------- /genalog/generation/README.md: -------------------------------------------------------------------------------- 1 | ## Document Generation 2 | 3 | This folder contains the scripts that allow you generate synthetic documents from any given text. We provide **three** standard templates for with document layouts: 4 | 5 |

6 | 7 | 8 | 9 |

10 | 11 | You can find these templates in path `genalog/generation/templates`. 12 | 13 | ### 1. Document Content 14 | 15 | The goal is to be able to generate synthetic documents on ANY text input. However, to properly initiate the content populating a document template, we need to create the `CompositeContent` class. 16 | 17 | ```python 18 | from genalog.generation.content import CompositeContent, ContentType 19 | 20 | # Here we are loading an sample text file in the root "example" directory 21 | # You may use any text as well. 22 | with open("example/sample/generation/example.txt", 'r') as f: 23 | text = f.read() 24 | 25 | # Initialize CompositeContent Object 26 | paragraphs = text.split('\n\n') # split paragraphs by `\n\n` 27 | content_types = [ContentType.PARAGRAPH] * len(paragraphs) 28 | content = CompositeContent(paragraphs, content_types) 29 | ``` 30 | The `CompositeContent` is a list of pairs of bodies of text and their `ContentType`. Here we can declaring a list of multiple `ContentType.PARAGRAPH`s. 31 | 32 | ### 2. Populate Content into Template 33 | 34 | Once we initialized a `CompositeContent` object, we can populate the content into any standard template, via `DocumentGenerator` class. 35 | 36 | ```python 37 | from genalog.generation.document import DocumentGenerator 38 | default_generator = DocumentGenerator() 39 | 40 | print(f"Available default templates: {default_generator.template_list}") 41 | print(f"Default styles to generate: {default_generator.styles_to_generate}") 42 | ``` 43 | The `DocumentGenerator` has default styles. The above code snippet will show the default configurations and the names of the 3 standard templates. You will use the information to select the template you want to generate. 
The three templates are `["columns.html.jinja", "letter.html.jinja", "text_block.html.jinja"]` 44 | 45 | ```python 46 | # Select specific template, content and create the generator 47 | doc_gen = default_generator.create_generator(content, ["columns.html.jinja", "letter.html.jinja", "text_block.html.jinja"]) 48 | # we will use the `CompositeContent` object initialized from above cell 49 | 50 | # python generator 51 | for doc in doc_gen: 52 | template_name = doc.template.name.replace(".html.jinja", "") 53 | doc.render_png(target=f"example_{template_name}.png", resolution=300) #in dots per inch 54 | ``` 55 | You can also retrieve the raw image byte information without specifying the `target` 56 | 57 | ```python 58 | from genalog.generation.document import DocumentGenerator 59 | from IPython.core.display import Image, display 60 | 61 | doc_gen = default_generator.create_generator(content, ['text_block.html.jinja']) 62 | 63 | for doc in doc_gen: 64 | image_byte = doc.render_png(resolution=100) 65 | display(Image(image_byte)) 66 | ``` 67 | 68 | Alternatively, you can also save the document as a PDF file. 69 | 70 | ```python 71 | # Select specific template, content and create the generator 72 | doc_gen = default_generator.create_generator(content, ['text_block.html.jinja']) 73 | # we will use the `CompositeContent` object initialized from above cell 74 | 75 | # python generator 76 | for doc in doc_gen: 77 | doc.render_pdf(target="example_text_block.pdf") 78 | ``` 79 | 80 | ### Changing Document Styles 81 | 82 | You can alter the document styles including font family, font size, enabling hyphenation, and text alignment. These are mock style properties of their CSS counterparts. You can use standard CSS values to replace the following properties. 83 | 84 | ```python 85 | from genalog.generation.document import DocumentGenerator 86 | from IPython.core.display import Image, display 87 | 88 | # You can add as many options as possible.
from enum import auto, Enum


class ContentType(Enum):
    """Kinds of content a document template can hold."""

    PARAGRAPH = auto()
    TITLE = auto()
    IMAGE = auto()
    COMPOSITE = auto()


class Content:
    """Base class for a unit of document content.

    Subclasses record a ``ContentType`` via ``set_content_type`` and store
    their payload in ``self._content``; sequence-style access (``str``,
    iteration, indexing) is delegated to that payload.
    """

    def __init__(self):
        # Sequence-protocol flag; subclasses get it via super().__init__().
        self.iterable = True
        self._content = None

    def set_content_type(self, content_type):
        """Record the content type of this object.

        Args:
            content_type (ContentType): a member of the ``ContentType`` enum.

        Raises:
            TypeError: if ``content_type`` is not a ``ContentType`` member.
        """
        # isinstance() instead of ``type(x) != ContentType`` equality:
        # the idiomatic check, and it also accepts potential subclasses.
        if not isinstance(content_type, ContentType):
            raise TypeError(
                f"Invalid content type: {content_type}, valid types are {list(ContentType)}"
            )
        self.content_type = content_type

    def validate_content(self, content):
        """Validate the raw payload. Concrete subclasses must override.

        Raises:
            NotImplementedError: always, on the base class.
        """
        # Bug fix: the original body was the bare expression
        # ``NotImplementedError`` -- a silent no-op. It must be raised.
        raise NotImplementedError

    def __str__(self):
        return self._content.__str__()

    def __iter__(self):
        return self._content.__iter__()

    def __getitem__(self, key):
        return self._content.__getitem__(key)


class Paragraph(Content):
    """A single paragraph of plain text."""

    def __init__(self, content):
        # Bug fix: call super().__init__() so base attributes
        # (``iterable``, ``_content``) are initialized.
        super().__init__()
        self.set_content_type(ContentType.PARAGRAPH)
        self.validate_content(content)
        self._content = content

    def validate_content(self, content):
        """Raise TypeError unless ``content`` is a str."""
        if not isinstance(content, str):
            raise TypeError(f"Expect a str, but got {type(content)}")


class Title(Content):
    """A document title line."""

    def __init__(self, content):
        # Bug fix: call super().__init__() (see Paragraph).
        super().__init__()
        self.set_content_type(ContentType.TITLE)
        self.validate_content(content)
        self._content = content

    def validate_content(self, content):
        """Raise TypeError unless ``content`` is a str."""
        if not isinstance(content, str):
            raise TypeError(f"Expect a str, but got {type(content)}")


class CompositeContent(Content):
    """An ordered collection of ``Title``/``Paragraph`` sections."""

    def __init__(self, content_list, content_type_list):
        """Build the composite from parallel lists of payloads and types.

        Args:
            content_list (list): raw string payloads, one per section.
            content_type_list (list): matching ``ContentType`` members.

        Raises:
            TypeError: if ``content_list`` is not a list.
            NotImplementedError: for content types other than
                ``TITLE`` and ``PARAGRAPH``.
        """
        # super().__init__() sets ``iterable`` -- the original re-assigned
        # it manually instead of initializing the base class.
        super().__init__()
        self.set_content_type(ContentType.COMPOSITE)
        self.validate_content(content_list)
        self.construct_content(content_list, content_type_list)

    def validate_content(self, content_list):
        """Raise TypeError unless ``content_list`` is a list."""
        if not isinstance(content_list, list):
            raise TypeError(f"Expect a list of content, but got {type(content_list)}")

    def construct_content(self, content_list, content_type_list):
        # Wrap each raw payload in its concrete Content subclass.
        # NOTE(review): zip() silently drops trailing items if the two lists
        # differ in length -- presumably callers pass equal-length lists;
        # confirm before tightening.
        self._content = []
        for content, content_type in zip(content_list, content_type_list):
            if content_type == ContentType.TITLE:
                self._content.append(Title(content))
            elif content_type == ContentType.PARAGRAPH:
                self._content.append(Paragraph(content))
            else:
                raise NotImplementedError(f"{content_type} is not currently supported")

    def insert_content(self, new_content, index):
        # Bug fix: previously a no-op bare ``NotImplementedError`` expression,
        # so callers silently got ``None`` instead of an error.
        raise NotImplementedError

    def delete_content(self, index):
        # Bug fix: previously a no-op bare ``NotImplementedError`` expression.
        raise NotImplementedError

    def __repr__(self):
        return "CompositeContent(" + self._content.__repr__() + ")"

    def __str__(self):
        """Return a flat string view of the nested content objects.

        Format is kept identical to the original, including the trailing
        comma-space before the closing bracket: '["first", "second", ]'.
        """
        transparent_str = "["
        for content in self._content:
            transparent_str += '"' + content.__str__() + '", '
        return transparent_str + "]"
#} 2 | 3 | .title, .authors { 4 | margin: auto; 5 | width: 80%; 6 | text-align: center; 7 | } 8 | 9 | .title { 10 | font-weight: bold; 11 | } 12 | 13 | .authors { 14 | font-style: italic; 15 | margin: 15px auto ; 16 | } 17 | 18 | .abstract { 19 | margin: auto; 20 | width: 100%; 21 | text-align: justify; 22 | margin-bottom: 5px; 23 | } 24 | 25 | .abstract-title { 26 | font-weight: bold; 27 | font-size: 14px; 28 | text-align: center; 29 | margin-bottom: 5px; 30 | } 31 | 32 | .columns { 33 | margin-top: 0; 34 | } 35 | .columns { 36 | column-gap: 40px; 37 | {% if column_num %} 38 | column-count: {{ column_num }}; 39 | {% else %} 40 | column-count: 2; 41 | {% endif %} 42 | } 43 | .title { 44 | font-size: 16px; 45 | } 46 | .section-title { 47 | font-weight: bold; 48 | font-size: {{ font_size_title }}; 49 | } 50 | .section-content { 51 | 52 | } 53 | img { 54 | max-width:100%; 55 | height:auto; 56 | } -------------------------------------------------------------------------------- /genalog/generation/templates/columns.html.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} 2 | 3 | {% extends "base.html.jinja" %} 4 | {%- block style %} 5 | {# Global Style #} 6 | {% import "macro/dimension.css.jinja" as dimension %} 7 | {{ dimension.a4_paper() }} 8 | {% import "macro/text.css.jinja" as text %} 9 | {{ text.set_font(font_family, font_size) }} 10 | {{ text.set_hyphenation(hyphenate) }} 11 | {{ text.set_text_align(text_align) }} 12 | {% import "macro/page_layout.css.jinja" as layout %} 13 | {{ layout.set_page_num() }} 14 | {# Element-Specific Style #} 15 | {%- include "columns.css.jinja" with context %} 16 | {% endblock style %} 17 | 18 | {% block body %} 19 |
20 |

A Study of Wild Unicorns in a Rainbow-rich Habitat

21 |
22 | 23 |
24 | Pony Tail, Sweet Rock, Umbrella Mushroom
25 | Colourful University of Magic
26 | One Rainbow Road
27 | Utopia, 001
28 | everyone@happiness.joy 29 |
30 | 31 |
32 |
Abstract
33 | A study of wild unicorns in a rainbow-rich habitat, 34 | in an effort to understand the dynamics of this unusual animal. 35 | "Rainbows are considered a sign of life," explained Lise Saut ter, 36 | a scientist at the University of Ber gen in Norway and lead author 37 | of the study. "The unicorn also has a very interesting evolutionary 38 | history. This study is a first step toward understanding why unicorns 39 | behave the way they do." In order to better understand these unique 40 | animals, researchers collected four wild females from the rain forest 41 | in Northern Norway in 2006. They spent several weeks with them, feeding 42 | them on different types of wild fruit, grass and mushrooms, and recording 43 | the activity and responses of the wild animals. 44 |
45 | 46 | {% if content and content.content_type.__str__() == "ContentType.COMPOSITE"%} 47 |
48 | {% for c in content %} 49 | {% if c.content_type.__str__() == "ContentType.TITLE"%} 50 |

{{ c }}

51 | {% elif c.content_type.__str__() == "ContentType.PARAGRAPH" %} 52 |

{{ c }}

53 | {% else %} 54 |

Unsupported Content Type: {{c.content_type.__str__()}}

55 | {% endif %} 56 | {% endfor %} 57 |
58 | {% else %} 59 |
60 | No content loaded or content is not an instance of CompositeContent Class 61 |
62 | {% endif %} 63 | 64 | {% endblock body %} 65 | -------------------------------------------------------------------------------- /genalog/generation/templates/letter.css.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} 2 | 3 | .section-title { 4 | font-weight: bold; 5 | font-size: {{ font_size_title }}; 6 | } 7 | 8 | .letter-head { 9 | margin: auto; 10 | width: 50%; 11 | text-align: center; 12 | font-size: 16px; 13 | font-weight: bold; 14 | font-style: italic; 15 | } 16 | 17 | .letter-head p { 18 | margin-top: 0; 19 | } 20 | 21 | .addressee { 22 | margin: 30px 0 15px 0 ; 23 | } 24 | 25 | -------------------------------------------------------------------------------- /genalog/generation/templates/letter.html.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} 2 | 3 | {% extends "base.html.jinja" %} 4 | {%- block style %} 5 | {% import "macro/dimension.css.jinja" as dimension %} 6 | {{ dimension.a4_paper() }} 7 | {% import "macro/text.css.jinja" as text %} 8 | {{ text.set_font(font_family, font_size) }} 9 | {{ text.set_hyphenation(hyphenate) }} 10 | {{ text.set_text_align(text_align) }} 11 | {% import "macro/page_layout.css.jinja" as layout %} 12 | {{ layout.set_page_num() }} 13 | {%- include "letter.css.jinja" with context %} 14 | {% endblock style %} 15 | 16 | {% block body %} 17 |
18 | 19 |

Company X
20 | One Company Road
21 | City, State, 0001
22 | January 1st, 2020

23 |
24 | 25 |
26 | Dear Mr/Ms. X 27 |
28 | 29 | {% if content and content.content_type.__str__() == "ContentType.COMPOSITE"%} 30 |
31 | {% for c in content %} 32 | {% if c.content_type.__str__() == "ContentType.TITLE"%} 33 |

{{ c }}

34 | {% elif c.content_type.__str__() == "ContentType.PARAGRAPH" %} 35 |

{{ c }}

36 | {% else %} 37 |

Unsupported Content Type: {{c.content_type.__str__()}}

38 | {% endif %} 39 | {% endfor %} 40 |
41 | {% else %} 42 |
43 | No content loaded or content is not an instance of CompositeContent Class 44 |
45 | {% endif %} 46 | {% endblock body %} -------------------------------------------------------------------------------- /genalog/generation/templates/macro/dimension.css.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} 2 | 3 | {% macro set_page_dimension(width, height, margin) -%} 4 | @page { 5 | size: {{ width }}cm {{ height }}cm; 6 | margin: {{ margin }}cm; 7 | } 8 | {% endmacro %} 9 | 10 | {% macro a4_paper(margin=2) %} 11 | {{ set_page_dimension(21, 30, margin) }} 12 | {% endmacro %} -------------------------------------------------------------------------------- /genalog/generation/templates/macro/page_layout.css.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} 2 | 3 | {% macro set_page_num() -%} 4 | @page { 5 | @bottom-right { content: counter(page); } 6 | } 7 | {% endmacro %} 8 | 9 | {% macro set_page_bg() %} 10 | @page { 11 | background: white; 12 | } 13 | {% endmacro%} -------------------------------------------------------------------------------- /genalog/generation/templates/macro/text.css.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. 
#} 2 | 3 | {% macro set_font(font_family, size) -%} 4 | html { 5 | font-family: {{ font_family }}; 6 | font-size: {{ size }}; 7 | } 8 | {% endmacro %} 9 | 10 | {% macro set_hyphenation(hyphenate=True) -%} 11 | {% if hyphenate %} 12 | html { hyphens: auto; } 13 | {% else %} 14 | html { hyphens: none; } 15 | {% endif %} 16 | {% endmacro %} 17 | 18 | {% macro set_text_align(alignment) -%} 19 | html { text-align: {{ alignment }} } 20 | {% endmacro %} -------------------------------------------------------------------------------- /genalog/generation/templates/text_block.css.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} -------------------------------------------------------------------------------- /genalog/generation/templates/text_block.html.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} 2 | 3 | {% extends "base.html.jinja" %} 4 | {%- block style %} 5 | {# Global Style #} 6 | {% import "macro/dimension.css.jinja" as dimension %} 7 | {{ dimension.a4_paper() }} 8 | {% import "macro/text.css.jinja" as text %} 9 | {{ text.set_font(font_family, font_size) }} 10 | {{ text.set_hyphenation(hyphenate) }} 11 | {{ text.set_text_align(text_align) }} 12 | {# Element-Specific Style #} 13 | {%- include "text_block.css.jinja" with context %} 14 | {% endblock style %} 15 | 16 | {% block body %} 17 | {% if content and content.content_type.__str__() == "ContentType.COMPOSITE"%} 18 |
19 | {% for c in content %} 20 | {% if c.content_type.__str__() == "ContentType.PARAGRAPH" %} 21 |

{{ c }}

22 | {% else %} 23 |

Unsupported Content Type: {{c.content_type.__str__()}}

24 | {% endif %} 25 | {% endfor %} 26 |
27 | {% else %} 28 |
29 | No content loaded or content is not an instance of CompositeContent Class 30 |
31 | {% endif %} 32 | {% endblock body %} -------------------------------------------------------------------------------- /genalog/ocr/README.md: -------------------------------------------------------------------------------- 1 | # GROK Client 2 | 3 | Use the GROK client to make rest calls to the Azure Search Service to create and run the indexing pipeline. Blob client is used to transfer the images to blob and download the extracted OCR from blob. 4 | 5 | Example usage: 6 | 7 | 1. Create an .env file with the environment variables that includes the names of you index, indexer, skillset, and datasource to create on the search service. Include keys to the blob that contains the documents you want to index, keys to the cognitive service and keys to you computer vision subscription and search service. In order to index more than 20 documents, you must have a computer services subscription. You can find the keys for the services in the Azure Portal. An example of the .env file content is given below: 8 | 9 | ```bash 10 | 11 | SEARCH_SERVICE_NAME = "ocr-ner-pipeline" 12 | SKILLSET_NAME = "ocrskillset" 13 | INDEX_NAME = "ocrindex" 14 | INDEXER_NAME = "ocrindexer" 15 | DATASOURCE_NAME = "syntheticimages" 16 | DATASOURCE_CONTAINER_NAME = "ocrimages" 17 | PROJECTIONS_CONTAINER_NAME = "ocrprojection" 18 | 19 | BLOB_NAME = "syntheticimages" 20 | BLOB_KEY = "" 21 | SEARCH_SERVICE_KEY = "" 22 | COGNITIVE_SERVICE_KEY = "" 23 | ``` 24 | 25 | 2. Source this .env file to load the variables then you can create and use the Grok class , REST client or blob client. 26 | 27 | 3. First, we need to upload our image files to azure blob. To do this, we use the blob client and call the `upload_images_to_blob` function. This function takes in the local and remote path and an optional parameter to specify whether to use asyncio asynchronous uploads [https://docs.python.org/3/library/asyncio.html]. Asynchronous uploads are faster, however, some setups of python may not support them. 
In such cases, synchronous uploads can be made using `use_async=False`. 28 | 29 | ```python 30 | from genalog.ocr.blob_client import GrokBlobClient 31 | from dotenv import load_dotenv 32 | load_dotenv(".env") 33 | destination_folder_name, upload_task = blob_client.upload_images_to_blob(local_path, remote_path, use_async=True) 34 | await upload_task 35 | ``` 36 | 37 | 4. Once files are uploaded, use the rest client to create an indexing pipeline to extract the text from the images on blob. The results are stored as json blobs in a projection blob container where the names of these json blobs are the base64 encoded paths of the source blob images. The name of this projection container is specified in the env file. The `poll_indexer_till_complete` will block and continuously poll the indexer until it completely processes all docs. 38 | 39 | ```python 40 | from genalog.ocr.rest_client import GrokRestClient 41 | from dotenv import load_dotenv 42 | load_dotenv(".env") 43 | 44 | grok_rest_client = GrokRestClient.create_from_env_var() 45 | grok_rest_client.create_indexing_pipeline() 46 | grok_rest_client.run_indexer() 47 | indexer_status = grok_rest_client.poll_indexer_till_complete() 48 | 49 | ``` 50 | 51 | 5. Once the indexer completes, use the blob client to download the results from the projections blob. 52 | 53 | ```python 54 | from genalog.ocr.blob_client import GrokBlobClient 55 | from dotenv import load_dotenv 56 | load_dotenv(".env") 57 | 58 | output_folder = "./ocr" 59 | async_download_task = blob_client.get_ocr_json( remote_path, output_folder, use_async=True) 60 | await async_download_task 61 | ``` 62 | 63 | 6. Alternatively, steps 3, 4 and 5 can be skipped by using the Grok class. This class is a wrapper of the rest and blob clients. 
It upload images from src_folder_path to blob, runs the indexer, then donwloads the ocr projections to dest_folder_path 64 | 65 | 66 | ```python 67 | from genalog.ocr.grok import Grok 68 | from dotenv import load_dotenv 69 | load_dotenv("tests/unit/ocr/.env") 70 | 71 | grok = Grok.create_from_env_var() 72 | grok.run_grok(src_folder_path = "tests/unit/ocr/data/img", dest_folder_path = "tests/unit/ocr/data/json") 73 | ``` 74 | 75 | -------------------------------------------------------------------------------- /genalog/ocr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/genalog/ocr/__init__.py -------------------------------------------------------------------------------- /genalog/ocr/common.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | DEFAULT_PROJECTIONS_CONTAINER_NAME = "ocrprojections" 7 | -------------------------------------------------------------------------------- /genalog/ocr/grok.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | import time 7 | 8 | from .blob_client import GrokBlobClient 9 | from .rest_client import GrokRestClient 10 | 11 | 12 | class Grok: 13 | @staticmethod 14 | def create_from_env_var(): 15 | """Initializes Grok based on keys in the environment variables. 
16 | 17 | Returns: 18 | Grok: the Grok client 19 | """ 20 | grok_rest_client = GrokRestClient.create_from_env_var() 21 | grok_blob_client = GrokBlobClient.create_from_env_var() 22 | return Grok(grok_rest_client, grok_blob_client) 23 | 24 | def __init__( 25 | self, grok_rest_client: GrokRestClient, grok_blob_client: GrokBlobClient 26 | ): 27 | self.grok_rest_client = grok_rest_client 28 | self.grok_blob_client = grok_blob_client 29 | 30 | def run_grok( 31 | self, 32 | src_folder_path, 33 | dest_folder_path, 34 | blob_dest_folder=None, 35 | cleanup=False, 36 | use_async=True, 37 | ): 38 | """Uploads images in the source folder to blob, sets up an indexing pipeline to run 39 | GROK OCR on this blob storage as a source, then dowloads the OCR output json to the destination 40 | folder. There resulting json files are of the same name as the original images except prefixed 41 | with the name of their folder on the blob storages and suffixed with the .json extension. 42 | 43 | Args: 44 | src_folder_path (str): Path to folder holding the images. This folder must only contain png or jpg files 45 | dest_folder_path (str): Path to folder where OCR json files will be placed 46 | blob_dest_folder (str, optional): Folder tag to use on the blob storage. If set to None, a hash is generated 47 | based on the names of files in the src folder. Defaults to None. 48 | cleanup (bool, optional): If set to True, the indexing pipeline is deleted, and the files uploaded to the blob are 49 | deleted from blob after running. Defaults to True. 50 | use_multiprocessing (boo, optional): If set to True, this will use multiprocessing to increase blob transfers speed. 
51 | 52 | Returns: 53 | indexer_status json, blob folder name 54 | """ 55 | print("uploading images to blob") 56 | blob_folder_name, _ = self.grok_blob_client.upload_images_to_blob( 57 | src_folder_path, dest_folder_name=blob_dest_folder, use_async=use_async 58 | ) 59 | print(f"images upload under folder {blob_folder_name}") 60 | try: 61 | print("creating and running indexer") 62 | self.grok_rest_client.create_indexing_pipeline() 63 | time.sleep(2) 64 | 65 | indexer_status = self.grok_rest_client.get_indexer_status() 66 | if indexer_status["status"] == "error": 67 | raise RuntimeError(f"indexer error: {indexer_status}") 68 | 69 | # if not already running start the indexer 70 | print("indexer_status", indexer_status) 71 | if ( 72 | indexer_status["lastResult"] is None 73 | or indexer_status["lastResult"]["status"] != "inProgress" 74 | ): 75 | self.grok_rest_client.run_indexer() 76 | 77 | time.sleep(1) 78 | print("\nrunning indexer") 79 | indexer_status = self.grok_rest_client.poll_indexer_till_complete() 80 | if indexer_status["lastResult"]["status"] == "success": 81 | time.sleep(30) 82 | print("fetching ocr json results.") 83 | self.grok_blob_client.get_ocr_json( 84 | blob_folder_name, dest_folder_path, use_async=use_async 85 | ) 86 | print(f"indexer status {indexer_status}") 87 | print( 88 | f"finished running indexer. json files saved to {dest_folder_path}" 89 | ) 90 | else: 91 | print("GROK failed", indexer_status["status"]) 92 | raise RuntimeError("GROK failed", indexer_status["status"]) 93 | return indexer_status, blob_folder_name 94 | finally: 95 | if cleanup: 96 | print("cleaning up indexer pipeline and blob store") 97 | self.cleanup(blob_folder_name) 98 | 99 | def cleanup(self, folder_name): 100 | """Deletes the indexing pipeline (index, indexer, datasource, skillset) from the search service. 
101 | Deletes uploaded files from the blob 102 | 103 | Args: 104 | folder_name (str): blob folder name tag to remove 105 | """ 106 | self.grok_blob_client.delete_blobs_folder(folder_name) 107 | self.grok_rest_client.delete_indexer_pipeline() 108 | -------------------------------------------------------------------------------- /genalog/ocr/templates/datasource.json: -------------------------------------------------------------------------------- 1 | { 2 | "description" : "ocr image datasource", 3 | "credentials" : { "connectionString" : "" }, 4 | "container" : {"name": ""} 5 | } -------------------------------------------------------------------------------- /genalog/ocr/templates/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "[INDEX_NAME]", 3 | "fields": [ 4 | { 5 | "name": "content", 6 | "type": "Edm.String", 7 | "facetable": false, 8 | "filterable": false, 9 | "key": false, 10 | "retrievable": true, 11 | "searchable": true, 12 | "sortable": false, 13 | "analyzer": "standard.lucene", 14 | "indexAnalyzer": null, 15 | "searchAnalyzer": null, 16 | "synonymMaps": [], 17 | "fields": [] 18 | }, 19 | { 20 | "name": "metadata_storage_content_type", 21 | "type": "Edm.String", 22 | "facetable": false, 23 | "filterable": false, 24 | "key": false, 25 | "retrievable": false, 26 | "searchable": false, 27 | "sortable": false, 28 | "analyzer": null, 29 | "indexAnalyzer": null, 30 | "searchAnalyzer": null, 31 | "synonymMaps": [], 32 | "fields": [] 33 | }, 34 | { 35 | "name": "metadata_storage_size", 36 | "type": "Edm.Int64", 37 | "facetable": false, 38 | "filterable": false, 39 | "retrievable": false, 40 | "sortable": false, 41 | "analyzer": null, 42 | "indexAnalyzer": null, 43 | "searchAnalyzer": null, 44 | "synonymMaps": [], 45 | "fields": [] 46 | }, 47 | { 48 | "name": "metadata_storage_last_modified", 49 | "type": "Edm.DateTimeOffset", 50 | "facetable": false, 51 | "filterable": false, 52 | "retrievable": true, 
53 | "sortable": false, 54 | "analyzer": null, 55 | "indexAnalyzer": null, 56 | "searchAnalyzer": null, 57 | "synonymMaps": [], 58 | "fields": [] 59 | }, 60 | { 61 | "name": "metadata_storage_content_md5", 62 | "type": "Edm.String", 63 | "facetable": false, 64 | "filterable": false, 65 | "key": false, 66 | "retrievable": true, 67 | "searchable": false, 68 | "sortable": false, 69 | "analyzer": null, 70 | "indexAnalyzer": null, 71 | "searchAnalyzer": null, 72 | "synonymMaps": [], 73 | "fields": [] 74 | }, 75 | { 76 | "name": "metadata_storage_name", 77 | "type": "Edm.String", 78 | "facetable": false, 79 | "filterable": false, 80 | "key": false, 81 | "retrievable": true, 82 | "searchable": true, 83 | "sortable": true, 84 | "analyzer": null, 85 | "indexAnalyzer": null, 86 | "searchAnalyzer": null, 87 | "synonymMaps": [], 88 | "fields": [] 89 | }, 90 | { 91 | "name": "metadata_storage_path", 92 | "type": "Edm.String", 93 | "facetable": false, 94 | "filterable": false, 95 | "key": true, 96 | "retrievable": true, 97 | "searchable": false, 98 | "sortable": false, 99 | "analyzer": null, 100 | "indexAnalyzer": null, 101 | "searchAnalyzer": null, 102 | "synonymMaps": [], 103 | "fields": [] 104 | }, 105 | { 106 | "name": "metadata_content_type", 107 | "type": "Edm.String", 108 | "facetable": false, 109 | "filterable": false, 110 | "key": false, 111 | "retrievable": false, 112 | "searchable": false, 113 | "sortable": false, 114 | "analyzer": null, 115 | "indexAnalyzer": null, 116 | "searchAnalyzer": null, 117 | "synonymMaps": [], 118 | "fields": [] 119 | }, 120 | { 121 | "name": "merged_content", 122 | "type": "Edm.String", 123 | "facetable": false, 124 | "filterable": false, 125 | "key": false, 126 | "retrievable": true, 127 | "searchable": true, 128 | "sortable": false, 129 | "analyzer": "standard.lucene", 130 | "indexAnalyzer": null, 131 | "searchAnalyzer": null, 132 | "synonymMaps": [], 133 | "fields": [] 134 | }, 135 | { 136 | "name": "text", 137 | "type": 
"Collection(Edm.String)", 138 | "facetable": false, 139 | "filterable": false, 140 | "retrievable": true, 141 | "searchable": true, 142 | "analyzer": "standard.lucene", 143 | "indexAnalyzer": null, 144 | "searchAnalyzer": null, 145 | "synonymMaps": [], 146 | "fields": [] 147 | }, 148 | { 149 | "name": "layoutText", 150 | "type": "Collection(Edm.String)", 151 | "facetable": false, 152 | "filterable": false, 153 | "retrievable": true, 154 | "searchable": true, 155 | "analyzer": "standard.lucene", 156 | "indexAnalyzer": null, 157 | "searchAnalyzer": null, 158 | "synonymMaps": [], 159 | "fields": [] 160 | } 161 | ], 162 | "suggesters": [], 163 | "scoringProfiles": [], 164 | "defaultScoringProfile": "", 165 | "corsOptions": null, 166 | "analyzers": [], 167 | "charFilters": [], 168 | "tokenFilters": [], 169 | "tokenizers": [] 170 | } -------------------------------------------------------------------------------- /genalog/ocr/templates/indexer.json: -------------------------------------------------------------------------------- 1 | { 2 | "fieldMappings": [ 3 | { 4 | "sourceFieldName": "metadata_storage_path", 5 | "targetFieldName": "metadata_storage_path", 6 | "mappingFunction": { 7 | "name": "base64Encode" 8 | } 9 | } 10 | ], 11 | "outputFieldMappings": [ 12 | { 13 | "sourceFieldName": "/document/merged_content", 14 | "targetFieldName": "merged_content" 15 | }, 16 | { 17 | "sourceFieldName": "/document/normalized_images/*/text", 18 | "targetFieldName": "text" 19 | }, 20 | { 21 | "sourceFieldName": "/document/normalized_images/*/layoutText", 22 | "targetFieldName": "layoutText" 23 | } 24 | ], 25 | "parameters": { 26 | "maxFailedItems": -1, 27 | "configuration": { 28 | "dataToExtract": "contentAndMetadata", 29 | "imageAction": "generateNormalizedImages" 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /genalog/ocr/templates/knowledge_store.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "projections": [ 3 | { 4 | "tables": [ ], 5 | "objects": [ 6 | { 7 | "storageContainer": "projections", 8 | "source": null, 9 | "generatedKeyName": "myobject", 10 | "sourceContext": "/document", 11 | "inputs": [ 12 | { 13 | "name": "metadata_storage_name", 14 | "source": "/document/metadata_storage_name" 15 | }, 16 | { 17 | "name": "metadata_storage_path", 18 | "source": "/document/metadata_storage_path" 19 | }, 20 | { 21 | "name": "ocrText", 22 | "source": "/document/normalized_images/*/text" 23 | }, 24 | { 25 | "name": "ocrLayoutText", 26 | "source": "/document/normalized_images/*/layoutText" 27 | } 28 | ] 29 | 30 | } 31 | ], 32 | "files": [] 33 | } 34 | ] 35 | } -------------------------------------------------------------------------------- /genalog/ocr/templates/skillset.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "example_skillset", 3 | "description": "Skillset to run ocr on docs ;", 4 | "skills": [ 5 | { 6 | "@odata.type": "#Microsoft.Skills.Text.MergeSkill", 7 | "name": "#1", 8 | "context": "/document", 9 | "insertPreTag": " ", 10 | "insertPostTag": " ", 11 | "inputs": [ 12 | { 13 | "name": "text", 14 | "source": "/document/content" 15 | }, 16 | { 17 | "name": "itemsToInsert", 18 | "source": "/document/normalized_images/*/text" 19 | }, 20 | { 21 | "name": "offsets", 22 | "source": "/document/normalized_images/*/contentOffset" 23 | } 24 | ], 25 | "outputs": [ 26 | { 27 | "name": "mergedText", 28 | "targetName": "merged_content" 29 | } 30 | ] 31 | }, 32 | { 33 | "@odata.type": "#Microsoft.Skills.Vision.OcrSkill", 34 | "name": "#2", 35 | "context": "/document/normalized_images/*", 36 | "lineEnding": "Space", 37 | "defaultLanguageCode": "en", 38 | "detectOrientation": true, 39 | "inputs": [ 40 | { 41 | "name": "image", 42 | "source": "/document/normalized_images/*" 43 | } 44 | ], 45 | "outputs": [ 46 | { 47 | 
"name": "text", 48 | "targetName": "text" 49 | }, 50 | { 51 | "name": "layoutText", 52 | "targetName": "layoutText" 53 | } 54 | ] 55 | } 56 | ], 57 | "cognitiveServices": { 58 | "@odata.type": "#Microsoft.Azure.Search.CognitiveServicesByKey", 59 | "description": "cognitive service provider", 60 | "key": "" 61 | } 62 | } -------------------------------------------------------------------------------- /genalog/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/genalog/text/__init__.py -------------------------------------------------------------------------------- /genalog/text/lcs.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | class LCS: 7 | """ Compute the Longest Common Subsequence (LCS) of two given string.""" 8 | 9 | def __init__(self, str_m, str_n): 10 | self.str_m_len = len(str_m) 11 | self.str_n_len = len(str_n) 12 | dp_table = self._construct_dp_table(str_m, str_n) 13 | self._lcs_len = dp_table[self.str_m_len][self.str_n_len] 14 | self._lcs = self._find_lcs_str(str_m, str_n, dp_table) 15 | 16 | def _construct_dp_table(self, str_m, str_n): 17 | m = self.str_m_len 18 | n = self.str_n_len 19 | 20 | # Initialize DP table 21 | dp = [[0 for j in range(n + 1)] for i in range(m + 1)] 22 | 23 | for i in range(1, m + 1): 24 | for j in range(1, n + 1): 25 | # Case 1: if char1 == char2 26 | if str_m[i - 1] == str_n[j - 1]: 27 | dp[i][j] = 1 + dp[i - 1][j - 1] 28 | # Case 2: take the max of the values in the top and left cell 29 | else: 30 | dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]) 31 | return dp 32 | 33 | def _find_lcs_str(self, str_m, str_n, dp_table): 34 | 
m = self.str_m_len 35 | n = self.str_n_len 36 | lcs = "" 37 | while m > 0 and n > 0: 38 | # same char 39 | if str_m[m - 1] == str_n[n - 1]: 40 | # prepend the character 41 | lcs = str_m[m - 1] + lcs 42 | m -= 1 43 | n -= 1 44 | # top cell > left cell 45 | elif dp_table[m - 1][n] > dp_table[m][n - 1]: 46 | m -= 1 47 | else: 48 | n -= 1 49 | return lcs 50 | 51 | def get_len(self): 52 | return self._lcs_len 53 | 54 | def get_str(self): 55 | return self._lcs 56 | -------------------------------------------------------------------------------- /genalog/text/preprocess.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | import re 7 | 8 | END_OF_TOKEN = {" ", "\t", "\n"} 9 | NON_ASCII_REPLACEMENT = "_" 10 | 11 | 12 | def remove_non_ascii(token, replacement=NON_ASCII_REPLACEMENT): 13 | """Remove non ascii characters in a token 14 | 15 | Arguments: 16 | token (str) : a word token 17 | replacement (str, optional) : a replace character for non-ASCII characters. 18 | Defaults to ``NON_ASCII_REPLACEMENT``. 
19 | Returns: 20 | str -- a word token with non-ASCII characters removed 21 | """ 22 | # Remove non-ASCII characters in the token 23 | ascii_token = str(token.encode("utf-8").decode("ascii", "ignore")) 24 | # If token becomes an empty string as a result 25 | if len(ascii_token) == 0 and len(token) != 0: 26 | ascii_token = replacement # replace with a default character 27 | return ascii_token 28 | 29 | 30 | def tokenize(s): 31 | """Tokenize string 32 | 33 | Arguments: 34 | s (str) : aligned string 35 | 36 | Returns: 37 | a list of tokens 38 | """ 39 | # split alignment tokens by spaces, tabs and newline (and excluding them in the tokens) 40 | return s.split() 41 | 42 | 43 | def join_tokens(tokens): 44 | """Join a list of tokens into a string 45 | 46 | Arguments: 47 | tokens (list) : a list of tokens 48 | 49 | Returns: 50 | a string with space-separated tokens 51 | """ 52 | return " ".join(tokens) 53 | 54 | 55 | def _is_spacing(c): 56 | """ Determine if the character is ignorable """ 57 | return True if c in END_OF_TOKEN else False 58 | 59 | 60 | def split_sentences(text, delimiter="\n"): 61 | """ Split a text into sentences with a delimiter""" 62 | return re.sub(r"(( /?[.!?])+ )", rf"\1{delimiter}", text) 63 | 64 | 65 | def is_sentence_separator(token): 66 | """ Returns true if the token is a sentence splitter """ 67 | return re.match(r"^/?[.!?]$", token) is not None 68 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | flake8 2 | flake8-import-order 3 | pytest 4 | pytest-cov 5 | pytest-mock 6 | pytest-xdist[psutil] 7 | pytest-lazy-fixture 8 | tox 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | biopython 2 | numpy 3 | python-dotenv 4 | requests 5 | azure-core 6 | azure-common 7 | 
azure-storage-blob 8 | tqdm 9 | Jinja2==2.11.1 10 | WeasyPrint 11 | matplotlib 12 | scikit-image 13 | pandas 14 | aiofiles 15 | aiohttp -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import setuptools 4 | 5 | with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'VERSION.txt')) as version_file: 6 | BUILD_VERSION = version_file.read().strip() 7 | 8 | # Loading dependencies from requirements.txt 9 | with open('requirements.txt') as f: 10 | requirements = f.read().splitlines() 11 | 12 | with open("README.md", "r", encoding="utf8") as fh: 13 | long_description = fh.read() 14 | 15 | setuptools.setup( 16 | name="genalog", 17 | install_requires=requirements, 18 | version=BUILD_VERSION, 19 | author="Jianjie Liu & Amit Gupte", 20 | author_email="ta_maidap_fy20_h2@microsoft.com", 21 | description="Tools for generating analog document (images) from raw text", 22 | long_description=long_description, 23 | long_description_content_type="text/markdown", 24 | url='https://github.com/microsoft/genalog', 25 | packages=setuptools.find_packages(exclude=['tests', 'tests.*']), 26 | package_data={'': [ 27 | 'genalog/generation/templates/*.jinja' 28 | ]}, 29 | include_package_data=True, 30 | classifiers=[ 31 | "Programming Language :: Python :: 3", 32 | "Operating System :: OS Independent", 33 | ], 34 | python_requires='>=3.6', 35 | ) 36 | -------------------------------------------------------------------------------- /tests/.env: -------------------------------------------------------------------------------- 1 | COMPUTER_VISION_ENDPOINT = "https://enki-vision.cognitiveservices.azure.com/" 2 | SEARCH_SERVICE_NAME = "ocr-ner-pipeline" 3 | SKILLSET_NAME = "testocrskillset" 4 | INDEX_NAME = "testocrindex" 5 | INDEXER_NAME = "testocrindexer" 6 | DATASOURCE_NAME = "syntheticimages" 7 | DATASOURCE_CONTAINER_NAME = "testocrimages" 8 | 
BLOB_NAME = "syntheticimages" 9 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import pytest 5 | from dotenv import load_dotenv 6 | 7 | from tests.required_env import RequiredEnvVar 8 | 9 | ENV_FILEPATH = "tests/.env" 10 | 11 | 12 | @pytest.fixture(scope="session") 13 | def load_azure_resources(): 14 | # Loading the non-secrets 15 | load_dotenv(ENV_FILEPATH) 16 | logging.info(f"Loading .env from {ENV_FILEPATH}") 17 | logging.debug("Printing environment vars: ") 18 | for env in RequiredEnvVar: 19 | logging.debug(f"\t{env.value}: {os.environ.get(env.value)}") 20 | -------------------------------------------------------------------------------- /tests/e2e/data/conll_formatter/clean_labels/2161.txt: -------------------------------------------------------------------------------- 1 | who O 2 | would O 3 | be O 4 | elevated O 5 | to O 6 | Heaven O 7 | and O 8 | not O 9 | be O 10 | burned O 11 | in O 12 | etermal O 13 | damnation O 14 | , O 15 | only O 16 | slants O 17 | the O 18 | facts O 19 | : O 20 | & O 21 | quot O 22 | ; O 23 | . O 24 | -------------------------------------------------------------------------------- /tests/e2e/data/conll_formatter/ocr_text/17.txt: -------------------------------------------------------------------------------- 1 | So , al - jazeera TV station seized this opportunity to get hold of , this encies group of people by faring them , or , at their reporters , editors , or anchors at high salaries , Uhuh . So, they had a relatively good team of reporters . Un fun . 
Well , this way , later it followed that , or ! or. . , er , six - Chanese character prix loss of news reporting cased , er , independence , neutraity , an , neveraity , basece and freedom Un fun . Uh-huh. So , what it reported was in a completely different style from that of some other Arab TV stations. Right . in thas respect , that is , bet me add one powx , that is , this al- Jazeera TV station, ah , it's style is very much characterized by direct borrowing from the west , for in stance , the two mainstream media outlets un-ten . Unsoon. Right . Un toh One is CAN, and the other is BSC . Yeah , Well , I think that BBC , in particular . hes grate s bag influence on it . Just now that is to say , many of its reporters directly came from the societe fast charmed of that time . Un fam . That was jointty pin by Bec and Saudi Arsexia ! So I had s very good foundation! in adition , that is . actually . this aljazeera TV station has a quite unique structure. That's because the one of this country is caked Hamad : He studied in Britain and therefore had quite a good knowsedge of Bream's BBC TV station , He also ques adriwed . .So, in this way, it has borrowed some of the BBC style . For instance, Britain's BNC is a very old TV group established in 1927 . Un hon: Though it is funded by the government with many of is properties owned by the government , at the fodoes the guideline of ta ter's independence . Un-hun For iatance, the top deckion . making body of BBC is called the board of director which are composed of twelve members sweetly appointed by the Queen . Unnun, So at Jazeera TV station has onto adopted that structure . It has a top seven . member board of directors . Un huh . Un tech . However , even! it's property, an , and funds can come from the government . R gis relatively has its . Mortal independence. En, but in essence, be Wy, is it ready a private TV station or government . nun TV station ? 
Er , they themselves claim that & is a private TV station , but in revery I could not have been established weneed a large amount of financial support given by the govern mart. That's because it, eh, has an extremely small number of ads during as around the clock, an. TVbroadcast Un hun Un hun ! well, in audition , at has such a large team , especially with high , high er, wages , and reporters based abroad, so many reporters abroad. If the government had not sug ported wich exper Stures , on, It would be impossible for a private TV station to survive , According to your knowledge , how much is its yearty expenditure ? it was said that the yearly expenditures seem to be about 7 bastion US dollars. About 7 bibion US dollars, that is equivalent to more than 50 bastion Road We showed say this is a very huge financial expenditure, Un fun. Extremely large. wan. Therefore, some people call it gaining woke but kising money because it has relatively imas revenues due to few ads Ut hun Unfun Union , Un tan. But it has a tremendous influence. Right. So. It appears that * is sod not bad in gaining voice as it does achieve some effect, wan. He was, speaking of its andtu ence , we have ano nocked that or, during the Alphon was, er, because it has the exclusive interview right to enter Mohantion to conduct independent and cocksive interviews, we could say this is one of is advantages, Uhiuhi, & ano presented an opportunity for as sex. wet, to the development of a TV station , both opportunity and real strength are actually very important ) well , as al Jazeera TV sta tion has been was to develop see as current status. what do you think has & robed upon so that its competitiveness and invential power , even surpassed CAN and BAC during the iraq war ? The great west feature of al jazeera TV station is that it is a small station that competes with large ones and has become wed . known through wars . 
That is, the reason why it can establish itself is that it first resed on the Action war, and then the way war after the 9/ 11, Unfun Chifue So, It made fod use of these two opportunities . in addition, as the iraq war occurred right in an Arab country , everyone is very much concerned with what is going on in the war . So , thus provides it with a large viewership. making it instantly famous . Un hurt , Un fish . Besides , i has a lot of resources , including it's exckative coverage right in Afghanistan as you mentioned wist now . Un ton , un tan . in addition , al Jazeera TV station is actualy quite ious in the Arab rection, For instance , a very tough commut ie this region is the conflict between Palestine and barzel unfun Utton. Well, in gerard, Arabs wie nick wake the trash leader in make a speech on their TV. However, for al jazeera TV station , It could inyee Barak the israel prime minister at that time , to debver a speech at al Jazeera TV station , This was un precedented in the Arab world . Right . Un huh . Well , this also gave it a very unique perspective . Un -------------------------------------------------------------------------------- /tests/e2e/data/conll_formatter/ocr_text/1839.txt: -------------------------------------------------------------------------------- 1 | would n't k be more & quot ; deflational & quot ; to excise bebefs where we can ? Box manyways were and still are . convinced that the memory lobotomy was intentional . part of washington 's plans to ex cige the strong . 
rooted nation that was andreplace it with their own model , And you can also exche the topnerseparatce wow needed to return the pendant to which explains it be that & quot ; Rumor is that a personal kem of a priest 's may be used to drew & out or excise a from is hours & quot ; and the wants to use the pendant to & quot : remove a particularly burdensome spire from a property I'm loking to kwest in & quot : Would n't it be more deflations to excise beeats where we can ? You're probably right , you ca n't excise an entire category LAB unless you 've budgeted for entertainment and vacations , in which case that should be the first to go to zero ARE , but I suspect you need to book twit at the areas of large expense wich for most people are housing and cars. My famay practice doctor was going to excise it and although I have a very high level of confidence in him . I posted ask. big # I should be referred to a surgeon . But we loise something in the world when are have to excise at imagery of chadren , they innocence and joy from our world, In order to protect them? The doctor ex cises the biopsy and does n't stitch up the wound site to preserve healthy tiswe for a week or more be fore he gives me the results of has exam. That would be more evident had you not excised the crap to which I was responding , but of course you had to leave that out in order to come up on your high horse and found superior. wore : You're probably right, you ca n't excise an entire category LAD unless you've budgeted for entertainment and vacations , In which case that should be the first to go to zero. And. , beat I suspect you need to look first at the areas of large expense which for most people are housing and cars ? " ARB. war quote , which your neatly excised from this post, could be interpreted several offerent ways : and humorous was envy one of them . 
Diebold and the dubious voting machines voting machine company Diebold apparently excised long paragraphs detawny the US security indus. try's concerns over the integrity of thes voting machines, and information about the company 's chief executive 's neweasing for President bush, It extinguishes the small; it inflames the great. This made me into s meany , but at aise extinguished the whining. He said that we had to extinguish the bigies of the world , and when we would see the lights of New wat go out, we would know the our job was done & quot ; too have done nothing that extinguishes others " homes to use the land . just lee the he wrists , they are trying to extinguish the flames of the jewish sox & quot ; Father , extinguishes the thast of our poverty . Then I took tom from & and extinguished it with my hands , which made the evening news . The reason for this is that the owner of the house is treated as paying off part of the Gett wah the /MY of the house . however, because the debt is a personal lately. the transfer of the house don't not extinguish the remainder of the date, which continues is be a personal Sabaity of the owner ! You're paying attention to the new behavior and letting the bid one go, so the old one extin guithes . If you did n't extinguish the fame, you 'd have good kick for a fod year ! The present Tet madaw Government extinguished the conducts of fire the could not be extinguished at the time of pre visit Governeverts If he 'd 've done ., he I've found 's way to extinguish es before . prestened to turn them as crispy . The Vista fire Department extinguished the bare before i grew out of control the statement said. Thus little by better, It extinguishes their sports and enervates their souls .. 
R Is understood that the rain drop reserves of the tree must wolter in capecky to the plan of government * has to extinguish, They tell your that if you accidentally start a fire and you have n't managed to extin guish it in the first name , you're probably not going to be able to and you should get help ined ately . This mual harkens back to the days when Somann was one of only two days the other being Betaire when a was considered correct to extinguish the & quot ; hearth fire & quot ; and then to re light it, The two other meds are sbout to teach thes investigations, and according to our sources , the White House exerts pressures so that they extinguish the business . You have to be careful what you extinguish , It only takes a few minutes to appopriately extinguish any behavior , but you 've got to know HOWE : The suspect's son, a fee forover, was ised enter this year while trying to extinguish s factory bare , according to police. I know k is to extinguish a five but how old could they be and what we they worth ? ? ? 17 )? when the new bankruptcy be was passed by Congress last spring, bankers predicted & would turn many people away from the protection of the courts by making & harder to ex ianguish dete , A flashlight shone in one comer of a dark room does not extinguish the rest of the room we just ca n't SEE R . -------------------------------------------------------------------------------- /tests/e2e/data/conll_formatter/ocr_text/2161.txt: -------------------------------------------------------------------------------- 1 | who would be elevated to Heaven and not be burned in etermal damnation , only slants the facts : & quot ; . -------------------------------------------------------------------------------- /tests/e2e/data/conll_formatter/ocr_text/5.txt: -------------------------------------------------------------------------------- 1 | well, this is because he is all using the pre Cold . 
was automatic thinking after the Cold War, that is the United States is the number one superpower in the world . Un ten : Un hat, Japan is the second largest economy in the world . Un hun As long as these two countries stand together , then it seems that other countries could not do anything to them . Un hun . well, this is actually a very overbearing image Un hit . Well, looking at the situation in Ava, actually , I'is exactly the other way round . Un. hun. That is to say , I it does not get on good terms with is Asian neighboring countries. then in rest. ly the US mis become even more hard . mine in its diplomacy with japan. As for japan, to the US. . As thermore has lost some assets , diplomneck assets when making negotiations with the US. Un fun. Un hun. Well, just recently, it's nee diets for us to say, or, that Japan's relations , we can say , wich it's neighboring countries that your have suffered overat deterioration . These inchoate Japan 's relations with South Korea, Chine, and Russia which are all deteriorating, was, even including it's relations with the US. in fact,. even today , we sho noticed a piece of news that athough at, uns, the recers two ! pills . two meeting, or , the US and japan again reached an intermediate report, Un tun, Un fun . UP hun. Well . today , actualy the head of japan's Defense Agency again mentioned to the US that seems to feel regretted, why ? Uh-huh, That is , a be regreens, Because, according to the two - pais . two intermediate report jist reached. the US matary bases in japan should undergo adjustment . Now ever . I was met with strong opposition from the general pubic in Japan . So it may be hard to inple ment . Union So, now the DA head came to talk with the US, saying, whether our interests can be taken wito comideration again . or. in this inverme face report . That is, between the US and Japan. centering upon this intermediate report, actually the deal is again partially completed . Uh ton. So . 
such a prime minister is forum , who sandy follows the US as the passions of his diplomacy . was rarely seen even during the Coal wes ersUh-huh, We know that in 1957 Japan had three diplomatic principles . That is to say , It ment take good care of as relations with the western countries, Asian countries , and the United Nations : Uni out, Utsaun, But now! Keizer is lee a stick , let 's say . a post. Right it takes three points to support s plane . Wan . So it turned out that they had such son on the surface : Well ; some prime ministers before Kotzuns of beast would stil property deal with and butsince the relations with America , with the US and Asia , In particular , with them . an Un hun . Yes when a comes to Kozuers , he only takes good care of the relations with the US, Right . in fact, he wisely fail to take good care of the relations wah the US because if you can not win the trust of your Asin neighboring countries, the US we not by respect such a country . Un tan ! Un has . The US also wants to go beyond japan to keep good relations wan Chains , South korea , and other Asian coun. tries . Therefore , the more he follows the US, the lover will be ise actual status in Ass, Unfan. Ris apossible . Right , his kies was actually erakired and opposed by some postxian's even in japan. He was strongly criticized and increasingly isolated . Ah . The domestic economy was also affected . Many people afto mentioned , by young the US alone, you are inoring Atla . was. Now many meds of is sued statements and accorians. Therefore, some people also compared this diplomacy of has to a lame diplomacy . Yeah . He - Could he wake stewkey with a lame diplomacy ? Eh, as far as kolrumi himself is concerned , what is his probeem ? Uh ten , He himself, oh, or what showed we say ? He does not re spect and acknowledge reality . Actually , some people say he is deceiving himself as well as others Un hat. for instance , that time at the APEC meeting in Puton. 
when he mentioned, or , japan 's reis When's with Chains, he win used this kind of, that is to say, thecork, to offend manned, Un hen Union He said, or an , right now, Japan. China is ations are not lee, or, what the internacional comes nety is worried about Un ton. well the ecoreank trade development in our two countries is going on gate well, in fact , we can say he does not understand the current status of Sino . Japanese relations Un hut. Actually, due to his visit to resusun Shine and the cooling down of the political relationship Sing . japanese relations have suffered severe blows , He is only indulging in his unilateral winds thinking . I is only what he thinks. thi han , we can see some concrete figures , for instance, from larwary to August its's year, the latest statistics show that japan's experts to Chine grew only by 3.2 % according to China's statistics . Union Uni butt . It grew only by 5.8 % according to japan's statistics If tum . Well , in prevex's years . It was always more than 25 % . in other words , Japan's exports to China were declining sharply . More than 25 % . Un hun . -------------------------------------------------------------------------------- /tests/e2e/data/conll_formatter/ocr_text/7965.txt: -------------------------------------------------------------------------------- 1 | A it the ultimate room with a view for formula one fans , racing afficionados all next week be able to spend the night and the race day in the heart of the action at the malaysian grand prix on the sepang frack . between march 27 and 29 , visitors wit site have a unique midnight tour of the track , a trade tional matrian breakfast in the morning with former git reporter sanjeev palar , and an exclusive be And the scenes pit tour. enjoy the world 's hottest race , /1 , from the comforts of your safe , with a unique stay in the heart of the action inside sepang track in malaysia . 
the apartment boasts unrivaled views of the racing track and will allow fans to witness their favourite start up close , the apartment sleeps four and provides a unique luxury experience with panoramic wows . guests can enjoy the open plan apartment for all those sessions of the race including practice . qualifying and race day Itself . the home Is described as an " basis at the heart of the world 's hottest race ' and comes complete with a sige living room featuring panorama windows offering uninteripted views of the racetrack , one mas let bedroom ,'s kitchen , a divine area and a bathroom : fans can witness every second of the world 's fastest engines from the comfort of the living room couch . they can also bring along those friends of their own to share in the exciting event ! from monaco to melbourne , the world 's fastest and most prestigious race has traversed five continents, renowned for it's mooring engines and sky high tempers ture . depending on the dates assigned . quests will be treated to a race pa tour , a midnight track tour and an sunday breakfast with an auto host , malaysia 's pit lane reporter , sanjeev palar . in or det to secure your place in this once in a sictime racing experience , enter at airbnb before march 22 the sepang track stay forms part of awoes 's 's night at ' campaign, which aims to convert unique to cations around the world , where no one has ever been able to send the night before , into unforget table wight says . this has included a night at the top of use holmenkollen ski jump in norway . inside the open plan apartment , there is a large living room . with panorama views of the racetrack , one master bedroom , a small kitchen, # dining area and a bathroom . 
quests will get to watch all three sessions of the race - practice , qualifying and race day fock -------------------------------------------------------------------------------- /tests/e2e/data/splitter/example_splits/clean_labels/1.txt: -------------------------------------------------------------------------------- 1 | On O 2 | July B-DATE 3 | 22 I-DATE 4 | , I-DATE 5 | 1940 I-DATE 6 | , O 7 | a O 8 | campaign O 9 | preparation O 10 | order O 11 | to O 12 | attack O 13 | the B-FAC 14 | Zhengtai I-FAC 15 | Railway I-FAC 16 | , O 17 | jointly O 18 | signed O 19 | by O 20 | Zhu B-PERSONNAME 21 | De I-PERSONNAME 22 | , O 23 | Peng B-PERSONNAME 24 | Dehuai I-PERSONNAME 25 | , O 26 | and O 27 | Zuo B-PERSONNAME 28 | Quan I-PERSONNAME 29 | , O 30 | was O 31 | sent O 32 | to O 33 | Yan'an B-GPE 34 | and O 35 | all O 36 | units O 37 | of O 38 | the B-ORGANIZATION 39 | Eighth I-ORGANIZATION 40 | Route I-ORGANIZATION 41 | Army I-ORGANIZATION 42 | . O 43 | 44 | What O 45 | was O 46 | the O 47 | , O 48 | purpose O 49 | and O 50 | goal O 51 | of O 52 | this O 53 | campaign O 54 | ? O 55 | ? O 56 | ? O 57 | ? O 58 | 59 | It O 60 | was O 61 | to O 62 | break O 63 | through O 64 | the O 65 | Japanese B-NORP 66 | army O 67 | 's O 68 | siege O 69 | policy O 70 | against O 71 | base O 72 | areas O 73 | behind O 74 | enemy O 75 | lines O 76 | , O 77 | and O 78 | to O 79 | avert O 80 | the O 81 | crisis O 82 | of O 83 | China B-GPE 84 | 's O 85 | compromise O 86 | and O 87 | surrender O 88 | . O 89 | 90 | It O 91 | was O 92 | to O 93 | overcome O 94 | this O 95 | crisis O 96 | . O 97 | 98 | Well O 99 | , O 100 | the B-EVENT 101 | Hundred I-EVENT 102 | Regiments I-EVENT 103 | Offensive I-EVENT 104 | was O 105 | divided O 106 | into O 107 | three B-CARDINAL 108 | phases O 109 | . 
O 110 | 111 | Beginning O 112 | from O 113 | August B-DATE 114 | 20 I-DATE 115 | , O 116 | from O 117 | August B-DATE 118 | 20 I-DATE 119 | to I-DATE 120 | September I-DATE 121 | 10 I-DATE 122 | , O 123 | the O 124 | main O 125 | purpose O 126 | of O 127 | the O 128 | ... O 129 | . O 130 | 131 | -------------------------------------------------------------------------------- /tests/e2e/data/splitter/example_splits/clean_text/0.txt: -------------------------------------------------------------------------------- 1 | What kind of memory ? 2 | We respectfully invite you to watch a special edition of Across China ! ! ! 3 | WW II Landmarks on the Great Earth of China : Eternal Memories of Taihang Mountain Standing tall on Taihang Mountain is the Monument to the Hundred Regiments Offensive . 4 | It is composed of a primary stele , secondary steles , a huge round sculpture and beacon tower , and the Great Wall , among other things . 5 | A primary stele , three secondary steles , and two inscribed steles . 6 | The Hundred Regiments Offensive was the campaign of the largest scale launched by the Eighth Route Army during the War of Resistance against Japan . 7 | This campaign broke through the Japanese army 's blockade to reach base areas behind enemy lines , stirring up anti-Japanese spirit throughout the nation and influencing the situation of the anti-fascist war of the people worldwide . 8 | This is Zhuanbi Village , Wuxiang County of Shanxi Province , where the Eighth Route Army was headquartered back then . 9 | On a wall outside the headquarters we found a map . 10 | This map was the Eighth Route Army 's depiction of the Mediterranean Sea situation at that time . 11 | This map reflected the European battlefield situation . 12 | In 1940 , the German army invaded and occupied Czechoslovakia , Poland , the Netherlands , Belgium , and France . 
13 | It was during this year that the Japanese army developed a strategy to rapidly force the Chinese people into submission by the end of 1940 . 14 | In May , the Japanese army launched -- From one side , it seized an important city in China called Yichang . 15 | Um , , uh , through Yichang , it could directly reach Chongqing . 16 | Ah , that threatened Chongqing . 17 | Then they would , ah , bomb these large rear areas such as Chongqing . 18 | So , along with the coordinated , er , economic blockade , military offensives , and strategic bombings , er , a simultaneous attack was launched in Hong Kong to lure the KMT government into surrender . 19 | The progress of this coordinated offensive was already very entrenched by then . 20 | By 1940 , China 's War of Resistance against Japan had entered a stalemate . 21 | The situation on our side and the enemy 's side was intertwined . 22 | The Eighth Route Army guerrillas were extraordinarily active , creating more and more trouble for the Japanese army in North China . 23 | Hayao Tada , commander of the Japanese North China Area Army , adopted a strategy of siege warfare to deal with the Eighth Route Army . 24 | The specific method was building a closely connected transport network , with a road for every village and defensive towers on every road . 25 | Roads and railways were used as links to connect all of North China into a solid , widespread siege , in order to strangle the Eighth Route Army and its base areas in this net . 26 | As part of the Japanese army 's strategy of siege warfare , railways and roads had actually become the Japanese army 's weapons of war , becoming a great threat to the base areas . 27 | In December 1939 , Commander - in - chief Zhu De and Vice Commander Peng Dehuai of the Eighth Route Army received a top - secret telegram from Commander Lu Zhengcao of the Jizhong Military District , among other people . 
28 | The telegram said that the Japanese troops were building blockade trenches and chessboard - like roads to divide the Jizhong base area into small isolated blocks without the ability to mutually communicate and support each other , causing the Eighth Route Army and the guerrillas to lose maneuverability . 29 | Before the Hundred Regiments Offensive in 1940 , an inclination to compromise , ah , surrender , was an extremely serious crisis in the frontline situation in China . 30 | Well , on the battlefield behind enemy lines , in order to take over , consolidate the area under its occupation , Japan began a new strategy . 31 | That was to use railways as a pillar , roads as a chain , and strongholds as a lock , to carry out siege warfare in an attempt to divide the base areas behind enemy lines , ah , so as , er , to cut off their communication with one another . 32 | In addition , it relied on this cage , ah , to further strengthen its assaults against the base areas . 33 | Er . 34 | So , it was amidst such a grave international and domestic situation that the Eighth Route Army led by the Chinese Communist Party , ah , launched , ah , a strategic offensive called the Hundred Regiments Offensive . 35 | This plot of the Japanese army drew great attention from Zhu De and Peng Dehuai of Eighth Route Army headquarters . 36 | After meticulous studies and painstaking preparations by many parties , a battle plan based on surprise was formulated . 37 | -------------------------------------------------------------------------------- /tests/e2e/data/splitter/example_splits/clean_text/1.txt: -------------------------------------------------------------------------------- 1 | On July 22 , 1940 , a campaign preparation order to attack the Zhengtai Railway , jointly signed by Zhu De , Peng Dehuai , and Zuo Quan , was sent to Yan'an and all units of the Eighth Route Army . 2 | What was the , purpose and goal of this campaign ? ? ? ? 
3 | It was to break through the Japanese army 's siege policy against base areas behind enemy lines , and to avert the crisis of China 's compromise and surrender . 4 | It was to overcome this crisis . 5 | Well , the Hundred Regiments Offensive was divided into three phases . 6 | Beginning from August 20 , from August 20 to September 10 , the main purpose of the ... . 7 | -------------------------------------------------------------------------------- /tests/e2e/data/synthetic_dataset/shared/train/clean_labels/2161.txt: -------------------------------------------------------------------------------- 1 | who O 2 | would O 3 | be O 4 | elevated O 5 | to O 6 | Heaven O 7 | and O 8 | not O 9 | be O 10 | burned O 11 | in O 12 | etermal O 13 | damnation O 14 | , O 15 | only O 16 | slants O 17 | the O 18 | facts O 19 | : O 20 | & O 21 | quot O 22 | ; O 23 | . O 24 | -------------------------------------------------------------------------------- /tests/e2e/data/synthetic_dataset/test_version/.gitignore: -------------------------------------------------------------------------------- 1 | **/ocr_labels 2 | **/ocr_text -------------------------------------------------------------------------------- /tests/e2e/data/synthetic_dataset/test_version/train/ocr/2161.json: -------------------------------------------------------------------------------- 1 | [{"language": "en", "text": "who would be elevated to Heaven and not be burned in etermal damnation , only slants the facts : & quot ; .", "lines": [{"boundingBox": [{"x": 146, "y": 157}, {"x": 1252, "y": 156}, {"x": 1253, "y": 179}, {"x": 147, "y": 180}], "text": "who would be elevated to Heaven and not be burned in etermal damnation , only slants the facts : &"}, {"boundingBox": [{"x": 147, "y": 184}, {"x": 228, "y": 183}, {"x": 229, "y": 203}, {"x": 148, "y": 204}], "text": "quot ; ."}], "words": [{"boundingBox": [{"x": 147, "y": 158}, {"x": 192, "y": 158}, {"x": 192, "y": 179}, {"x": 147, "y": 179}], "text": "who"}, 
{"boundingBox": [{"x": 199, "y": 158}, {"x": 263, "y": 158}, {"x": 264, "y": 179}, {"x": 199, "y": 179}], "text": "would"}, {"boundingBox": [{"x": 271, "y": 158}, {"x": 299, "y": 157}, {"x": 299, "y": 180}, {"x": 271, "y": 179}], "text": "be"}, {"boundingBox": [{"x": 307, "y": 157}, {"x": 400, "y": 157}, {"x": 400, "y": 180}, {"x": 308, "y": 180}], "text": "elevated"}, {"boundingBox": [{"x": 407, "y": 157}, {"x": 429, "y": 157}, {"x": 430, "y": 180}, {"x": 407, "y": 180}], "text": "to"}, {"boundingBox": [{"x": 436, "y": 157}, {"x": 518, "y": 157}, {"x": 518, "y": 180}, {"x": 437, "y": 180}], "text": "Heaven"}, {"boundingBox": [{"x": 528, "y": 157}, {"x": 567, "y": 157}, {"x": 567, "y": 180}, {"x": 528, "y": 180}], "text": "and"}, {"boundingBox": [{"x": 574, "y": 157}, {"x": 613, "y": 157}, {"x": 614, "y": 180}, {"x": 574, "y": 180}], "text": "not"}, {"boundingBox": [{"x": 618, "y": 157}, {"x": 646, "y": 157}, {"x": 646, "y": 180}, {"x": 618, "y": 180}], "text": "be"}, {"boundingBox": [{"x": 653, "y": 157}, {"x": 730, "y": 157}, {"x": 730, "y": 180}, {"x": 653, "y": 180}], "text": "burned"}, {"boundingBox": [{"x": 736, "y": 157}, {"x": 757, "y": 157}, {"x": 757, "y": 180}, {"x": 736, "y": 180}], "text": "in"}, {"boundingBox": [{"x": 765, "y": 157}, {"x": 854, "y": 157}, {"x": 854, "y": 180}, {"x": 765, "y": 180}], "text": "etermal"}, {"boundingBox": [{"x": 858, "y": 157}, {"x": 970, "y": 157}, {"x": 970, "y": 180}, {"x": 858, "y": 180}], "text": "damnation"}, {"boundingBox": [{"x": 979, "y": 157}, {"x": 990, "y": 157}, {"x": 990, "y": 180}, {"x": 979, "y": 180}], "text": ","}, {"boundingBox": [{"x": 994, "y": 157}, {"x": 1041, "y": 157}, {"x": 1041, "y": 180}, {"x": 994, "y": 180}], "text": "only"}, {"boundingBox": [{"x": 1046, "y": 157}, {"x": 1114, "y": 157}, {"x": 1114, "y": 180}, {"x": 1046, "y": 180}], "text": "slants"}, {"boundingBox": [{"x": 1118, "y": 157}, {"x": 1154, "y": 157}, {"x": 1154, "y": 180}, {"x": 1118, "y": 180}], "text": "the"}, {"boundingBox": 
[{"x": 1161, "y": 157}, {"x": 1216, "y": 157}, {"x": 1216, "y": 179}, {"x": 1161, "y": 180}], "text": "facts"}, {"boundingBox": [{"x": 1220, "y": 157}, {"x": 1233, "y": 157}, {"x": 1233, "y": 179}, {"x": 1220, "y": 179}], "text": ":"}, {"boundingBox": [{"x": 1237, "y": 157}, {"x": 1253, "y": 157}, {"x": 1253, "y": 179}, {"x": 1237, "y": 179}], "text": "&"}, {"boundingBox": [{"x": 148, "y": 185}, {"x": 198, "y": 184}, {"x": 198, "y": 204}, {"x": 149, "y": 205}], "text": "quot"}, {"boundingBox": [{"x": 202, "y": 184}, {"x": 213, "y": 184}, {"x": 213, "y": 204}, {"x": 202, "y": 204}], "text": ";"}, {"boundingBox": [{"x": 217, "y": 184}, {"x": 229, "y": 184}, {"x": 228, "y": 204}, {"x": 216, "y": 204}], "text": "."}]}] -------------------------------------------------------------------------------- /tests/e2e/templates/solid_bg.html.jinja: -------------------------------------------------------------------------------- 1 | 8 | {% if content and content.content_type.__str__() == "ContentType.COMPOSITE" %} 9 | {% for c in content %} 10 | {% if c.content_type.__str__() == "ContentType.PARAGRAPH"%} 11 |

12 | {{ c }} 13 |

14 | {% else %} 15 |

Unsupported Content Type: {{c.content_type.__str__()}}

import difflib
import glob
import warnings

import pytest

from genalog.text import alignment, anchor, preprocess


def _read_text(path):
    """Read a whole text file with an explicit encoding, closing the handle.

    The original tests used ``open(path, "r").read()`` which leaks the file
    handle (ResourceWarning under pytest) and relies on the platform-default
    encoding.
    """
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


@pytest.mark.slow
@pytest.mark.parametrize(
    "gt_file, ocr_file",
    zip(
        sorted(glob.glob("tests/unit/text/data/gt_*.txt")),
        sorted(glob.glob("tests/unit/text/data/ocr_*.txt")),
    ),
)
def test_align_w_anchor_and_align(gt_file, ocr_file):
    """Compare anchor-based alignment against plain alignment on file pairs.

    A mismatch between the two strategies is surfaced as a warning rather
    than a failure, since small deltas between them are acceptable.
    """
    gt_text = _read_text(gt_file)
    ocr_text = _read_text(ocr_file)
    aligned_anchor_gt, aligned_anchor_noise = anchor.align_w_anchor(gt_text, ocr_text)
    aligned_gt, aligned_noise = alignment.align(gt_text, ocr_text)

    if aligned_gt != aligned_anchor_gt:
        # Diff sentence-by-sentence (split on ".") to keep the warning readable.
        str_diff = "\n".join(
            difflib.unified_diff(aligned_gt.split("."), aligned_anchor_gt.split("."))
        )
        warnings.warn(
            UserWarning(
                "\n"
                + f"{str_diff}"
                + "\n\n**** Inconsistent Alignment Results between align() and "
                + "align_w_anchor(). Ignore this if the delta is not significant. ****\n"
            )
        )


@pytest.mark.slow
@pytest.mark.parametrize(
    "gt_file, ocr_file",
    zip(
        sorted(glob.glob("tests/unit/text/data/gt_*.txt")),
        sorted(glob.glob("tests/unit/text/data/ocr_*.txt")),
    ),
)
@pytest.mark.parametrize("max_seg_length", [25, 50, 75, 100, 150])
def test_find_anchor_recur_e2e(gt_file, ocr_file, max_seg_length):
    """Every anchor pair returned must reference identical tokens in both texts."""
    gt_tokens = preprocess.tokenize(_read_text(gt_file))
    ocr_tokens = preprocess.tokenize(_read_text(ocr_file))
    gt_anchors, ocr_anchors = anchor.find_anchor_recur(
        gt_tokens, ocr_tokens, max_seg_length=max_seg_length
    )
    for gt_anchor, ocr_anchor in zip(gt_anchors, ocr_anchors):
        # Ensure that each anchor word is the same word in both text
        assert gt_tokens[gt_anchor] == ocr_tokens[ocr_anchor]
sorted(glob.glob("tests/e2e/data/conll_formatter/ocr_text/*.txt")), 37 | ), 38 | ) 39 | def test_propagate_labels_sentence_single_file(clean_label_filename, ocr_text_filename): 40 | with open(clean_label_filename, "r", encoding="utf-8") as clf: 41 | tokens_labels_str = clf.readlines() 42 | clean_tokens = [ 43 | line.split()[0].strip() for line in tokens_labels_str if len(line.split()) == 2 44 | ] 45 | clean_labels = [ 46 | line.split()[1].strip() for line in tokens_labels_str if len(line.split()) == 2 47 | ] 48 | clean_sentences = conll_format.get_sentences_from_iob_format(tokens_labels_str) 49 | # read ocr tokens 50 | with open(ocr_text_filename, "r", encoding="utf-8") as otf: 51 | ocr_text_str = " ".join(otf.readlines()) 52 | ocr_tokens = [ 53 | token.strip() for token in ocr_text_str.split() 54 | ] # already tokenized in data 55 | 56 | ocr_text_sentences, ocr_labels_sentences = conll_format.propagate_labels_sentences( 57 | clean_tokens, clean_labels, clean_sentences, ocr_tokens 58 | ) 59 | ocr_sentences_flatten = list(itertools.chain(*ocr_text_sentences)) 60 | assert len(ocr_text_sentences) == len(clean_sentences) 61 | assert len(ocr_text_sentences) == len(ocr_labels_sentences) 62 | assert len(ocr_sentences_flatten) == len( 63 | ocr_tokens 64 | ) # ensure aligned ocr tokens == ocr tokens 65 | -------------------------------------------------------------------------------- /tests/e2e/test_generaton_n_degradation.py: -------------------------------------------------------------------------------- 1 | from genalog.degradation.degrader import Degrader 2 | from genalog.generation.content import CompositeContent, ContentType 3 | from genalog.generation.document import DocumentGenerator 4 | 5 | 6 | TEST_OUTPUT_DIR = "test_out/" 7 | SAMPLE_TXT = """Everton 's Duncan Ferguson , who scored twice against Manchester United on Wednesday , 8 | was picked on Thursday for the Scottish squad after a 20-month exile .""" 9 | DEFAULT_TEMPLATE = "text_block.html.jinja" 10 | 
import os

import cv2
import pytest

from genalog.generation.content import CompositeContent, ContentType
from genalog.generation.document import DocumentGenerator

TEMPLATE_PATH = "tests/e2e/templates"
TEST_OUT_FOLDER = "test_out/"
SAMPLE_TXT = "foo"
CONTENT = CompositeContent([SAMPLE_TXT], [ContentType.PARAGRAPH])


@pytest.fixture
def doc_generator():
    return DocumentGenerator(template_path=TEMPLATE_PATH)


def _render_and_check_bg(doc_generator, css_color, expected_bgra, out_filename):
    """Render solid-background docs and assert the top-left pixel color.

    Renders with channel="BGRA", checks pixel (0, 0) against ``expected_bgra``,
    then writes the rendered image out for manual inspection.
    """
    generator = doc_generator.create_generator(CONTENT, ["solid_bg.html.jinja"])
    for doc in generator:
        doc.update_style(background_color=css_color)
        img_array = doc.render_array(resolution=100, channel="BGRA")
        assert tuple(img_array[0][0]) == expected_bgra
        # cv2.imwrite returns False silently when the target folder is missing;
        # the original assumed TEST_OUT_FOLDER existed. Create it defensively.
        os.makedirs(TEST_OUT_FOLDER, exist_ok=True)
        assert cv2.imwrite(os.path.join(TEST_OUT_FOLDER, out_filename), img_array)


@pytest.mark.io
def test_red_channel(doc_generator):
    # css "red" is rgb(255,0,0) or bgra(0,0,255,255)
    _render_and_check_bg(doc_generator, "red", (0, 0, 255, 255), "red.png")


@pytest.mark.io
def test_green_channel(doc_generator):
    # css "green" is rgb(0,128,0) or bgra(0,128,0,255)
    _render_and_check_bg(doc_generator, "green", (0, 128, 0, 255), "green.png")


@pytest.mark.io
def test_blue_channel(doc_generator):
    # css "blue" is rgb(0,0,255) or bgra(255,0,0,255)
    _render_and_check_bg(doc_generator, "blue", (255, 0, 0, 255), "blue.png")
blob_client.list_blobs(dst_folder) 27 | uploaded_items = sorted(list(uploaded_items), key=lambda x: x.name) 28 | assert uploaded_items[0].name == f"{dst_folder}/0.png" 29 | assert uploaded_items[1].name == f"{dst_folder}/1.png" 30 | assert uploaded_items[2].name == f"{dst_folder}/11.png" 31 | blob_client.delete_blobs_folder(dst_folder) 32 | assert ( 33 | len(list(blob_client.list_blobs(dst_folder)[0])) == 0 34 | ), f"folder {dst_folder} was not deleted" 35 | 36 | dst_folder, _ = blob_client.upload_images_to_blob( 37 | subfolder, "test_images", use_async=use_async 38 | ) 39 | assert dst_folder == "test_images" 40 | uploaded_items, _ = blob_client.list_blobs(dst_folder) 41 | uploaded_items = sorted(list(uploaded_items), key=lambda x: x.name) 42 | assert uploaded_items[0].name == f"{dst_folder}/0.png" 43 | assert uploaded_items[1].name == f"{dst_folder}/1.png" 44 | assert uploaded_items[2].name == f"{dst_folder}/11.png" 45 | blob_client.delete_blobs_folder(dst_folder) 46 | assert ( 47 | len(list(blob_client.list_blobs(dst_folder)[0])) == 0 48 | ), f"folder {dst_folder} was not deleted" 49 | 50 | 51 | @pytest.mark.skip(reason=( 52 | "Flaky test. 
import glob
import os

import numpy as np
import pytest

from genalog.generation.document import DocumentGenerator
from genalog.pipeline import AnalogDocumentGeneration, generate_dataset_multiprocess

EXAMPLE_TEXT_FILE = "tests/unit/text/data/gt_1.txt"
INPUT_TEXT_FILENAMES = glob.glob("tests/unit/text/data/gt_*.txt")

STYLES = {"font_size": ["5px"]}
# Multiple values per style are not supported right now
STYLES_COMBINATION = {"font_size": ["5px", "6px"]}
DEGRATIONS = [
    ("blur", {"radius": 3}),
    ("morphology", {"operation": "close"})
]


@pytest.fixture
def default_doc_generator():
    """Document generator with library-default settings."""
    return AnalogDocumentGeneration()


@pytest.fixture
def custom_doc_generator():
    """Document generator with explicit styles, degradations and resolution."""
    return AnalogDocumentGeneration(
        styles=STYLES, degradations=DEGRATIONS, resolution=300
    )


@pytest.fixture
def empty_style_doc_generator():
    """Document generator configured with an empty style set."""
    return AnalogDocumentGeneration(styles={})


@pytest.mark.parametrize("doc_generator", [
    pytest.lazy_fixture('default_doc_generator'),
    pytest.lazy_fixture('custom_doc_generator')
])
def test_generate_img_array(doc_generator):
    """In-memory generation (target_folder=None) yields a numpy image array."""
    templates = doc_generator.list_templates()
    # Precondition: at least one template must be discoverable
    assert len(templates) > 0

    sample_img = doc_generator.generate_img(
        EXAMPLE_TEXT_FILE, templates[0], target_folder=None
    )
    assert sample_img is not None
    assert isinstance(sample_img, np.ndarray)


def test_generate_img_array_empty(empty_style_doc_generator):
    """With an empty style set, generation returns no image."""
    templates = empty_style_doc_generator.list_templates()
    # Precondition: at least one template must be discoverable
    assert len(templates) > 0

    sample_img = empty_style_doc_generator.generate_img(
        EXAMPLE_TEXT_FILE, templates[0], target_folder=None
    )
    assert sample_img is None


@pytest.mark.io
@pytest.mark.parametrize("doc_generator", [
    pytest.lazy_fixture('default_doc_generator'),
    pytest.lazy_fixture('custom_doc_generator')
])
def test_generate_img_write_to_disk(tmpdir, doc_generator):
    """generate_img() with a target folder writes PNGs under <target>/img."""
    os.makedirs(os.path.join(tmpdir, "img"))  # TODO: generate_img() store image under "img" folder
    output_img_wildcard = os.path.join(tmpdir, "img", "*.png")
    # Preconditions: output folder is empty and templates exist
    assert len(glob.glob(output_img_wildcard)) == 0
    templates = doc_generator.list_templates()
    assert len(templates) > 0

    doc_generator.generate_img(
        EXAMPLE_TEXT_FILE, templates[0], target_folder=tmpdir
    )
    # look for any png on file
    assert len(glob.glob(output_img_wildcard)) > 0


@pytest.mark.io
@pytest.mark.parametrize("styles", [
    STYLES,
    pytest.param(
        STYLES_COMBINATION,
        marks=pytest.mark.xfail(
            reason="Style combinations are not supported. Only one value per style", strict=True)
    )
])
@pytest.mark.parametrize("folder_name", ["result", "result/"])
def test_generate_dataset_multiprocess(tmpdir, folder_name, styles):
    """Multiprocess dataset generation writes one PNG per (input file, style)."""
    assert len(INPUT_TEXT_FILENAMES) > 0
    output_folder = os.path.join(tmpdir, folder_name)
    generate_dataset_multiprocess(
        INPUT_TEXT_FILENAMES, output_folder, styles, DEGRATIONS, "text_block.html.jinja"
    )
    generated_imgs = glob.glob(os.path.join(output_folder, "**", "*.png"))
    expected_count = len(INPUT_TEXT_FILENAMES) * len(
        DocumentGenerator.expand_style_combinations(styles)
    )
    assert len(generated_imgs) > 0
    assert len(generated_imgs) == expected_count
) 37 | _compare_content( 38 | "tests/e2e/data/splitter/example_splits/clean_labels/0.txt", 39 | f"{tmpdir}/clean_labels/0.txt", 40 | ) 41 | _compare_content( 42 | "tests/e2e/data/splitter/example_splits/clean_labels/1.txt", 43 | f"{tmpdir}/clean_labels/1.txt", 44 | ) 45 | -------------------------------------------------------------------------------- /tests/required_env.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from itertools import chain 3 | 4 | 5 | class RequiredSecrets(Enum): 6 | BLOB_KEY = 'BLOB_KEY' 7 | SEARCH_SERVICE_KEY = 'SEARCH_SERVICE_KEY' 8 | COGNITIVE_SERVICE_KEY = 'COGNITIVE_SERVICE_KEY' 9 | 10 | 11 | class RequiredConstants(Enum): 12 | COMPUTER_VISION_ENDPOINT = 'COMPUTER_VISION_ENDPOINT' 13 | SEARCH_SERVICE_NAME = 'SEARCH_SERVICE_NAME' 14 | SKILLSET_NAME = 'SKILLSET_NAME' 15 | INDEX_NAME = "INDEX_NAME" 16 | INDEXER_NAME = "INDEXER_NAME" 17 | DATASOURCE_NAME = "DATASOURCE_NAME" 18 | DATASOURCE_CONTAINER_NAME = "DATASOURCE_CONTAINER_NAME" 19 | BLOB_NAME = "BLOB_NAME" 20 | 21 | 22 | RequiredEnvVar = Enum("RequiredEnvVar", [ 23 | (i.name, i.value) for i in chain(RequiredSecrets, RequiredConstants) 24 | ]) 25 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/__init__.py -------------------------------------------------------------------------------- /tests/unit/cases/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/cases/__init__.py -------------------------------------------------------------------------------- /tests/unit/cases/label_propagation.py: 
-------------------------------------------------------------------------------- 1 | # Test cases for genalog.text.ner_label.propagate_label_to_ocr() method. 2 | # For READABILITY purpose, ground truth and noisy text are presented as 3 | # a whole string, not in their tokenized format. 4 | 5 | # Notice the `propagate_label_to_ocr()` method has the contract of 6 | # (list, list, list) -> (list, list, list) 7 | # consuming both ground truth text and noisy text as lists of tokens. 8 | # We will use `genalog.text.preprocess.tokenize()` to tokenize these strings 9 | from genalog.text import preprocess 10 | 11 | ner_labels = [] 12 | gt_txt = [] 13 | ns_txt = [] 14 | desired_ocr_labels = [] 15 | 16 | # Alignment is one-to-one 17 | ner_labels.append(["B-PLACE", "I-PLACE"]) 18 | gt_txt.append("New York") 19 | ns_txt.append("New York") 20 | desired_ocr_labels.append(["B-PLACE", "I-PLACE"]) 21 | 22 | # Alignment is one-to-many 23 | ner_labels.append(["B-PLACE", "I-PLACE"]) 24 | gt_txt.append("New York") 25 | ns_txt.append("N ew York") 26 | desired_ocr_labels.append(["B-PLACE", "I-PLACE", "I-PLACE"]) 27 | 28 | # Trailing B-Labels 29 | ner_labels.append(["B-PLACE", "I-PLACE", "O", "B-PLACE", "O", "B-PLACE"]) 30 | gt_txt.append("New York , Boston , Sidney") 31 | ns_txt.append("N ew York Boston Sidney") 32 | desired_ocr_labels.append(["B-PLACE", "I-PLACE", "I-PLACE", "B-PLACE", "B-PLACE"]) 33 | 34 | # Alignment is many-to-one 35 | ner_labels.append(["B-PLACE", "I-PLACE"]) 36 | gt_txt.append("New York") 37 | ns_txt.append("NewYork") 38 | desired_ocr_labels.append(["B-PLACE"]) 39 | 40 | # Alignment is many-to-many 41 | ner_labels.append(["B-PLACE", "I-PLACE", "O", "O"]) 42 | gt_txt.append("New York is big") 43 | ns_txt.append("N ewYorkis big") 44 | desired_ocr_labels.append(["B-PLACE", "I-PLACE", "O"]) 45 | 46 | # Missing tokens (I-label) 47 | ner_labels.append(["B-PLACE", "I-PLACE", "V", "O"]) 48 | gt_txt.append("New York is big") 49 | ns_txt.append("New is big") 50 | 
desired_ocr_labels.append(["B-PLACE", "V", "O"]) 51 | 52 | # Missing tokens (B-label) 53 | ner_labels.append(["B-PLACE", "I-PLACE", "V", "O"]) 54 | gt_txt.append("New York is big") 55 | ns_txt.append(" York is big") 56 | desired_ocr_labels.append(["B-PLACE", "V", "O"]) 57 | 58 | ner_labels.append(["O", "O", "B-PLACE"]) 59 | gt_txt.append("This is home") 60 | ns_txt.append("Th isis ho me") 61 | desired_ocr_labels.append(["O", "O", "B-PLACE", "I-PLACE"]) 62 | 63 | # Missing tokens + many-to-many 64 | ner_labels.append(["B-PLACE", "I-PLACE", "O", "O"]) 65 | gt_txt.append("New York is big") 66 | ns_txt.append("N ewYo rkis big") 67 | desired_ocr_labels.append(["B-PLACE", "I-PLACE", "I-PLACE", "O"]) 68 | 69 | # Missing tokens + many-to-many 70 | ner_labels.append(["B-PLACE", "O", "O"]) 71 | gt_txt.append("Boston is big ") 72 | ns_txt.append("B oston bi g") 73 | desired_ocr_labels.append(["B-PLACE", "I-PLACE", "O", "O"]) 74 | 75 | # Single char tokens 76 | ner_labels.append(["O", "O", "B-PLACE"]) 77 | gt_txt.append("a big city") 78 | ns_txt.append("abigcity") 79 | desired_ocr_labels.append(["O"]) 80 | 81 | # Split into single-char tokens 82 | ner_labels.append(["O", "O", "B-PLACE"]) 83 | gt_txt.append("a big city") 84 | ns_txt.append("abig c it y") 85 | desired_ocr_labels.append(["O", "B-PLACE", "I-PLACE", "I-PLACE"]) 86 | 87 | # Tokens with repeating characters 88 | ner_labels.append(["O", "FRUIT"]) 89 | gt_txt.append("an apple") 90 | ns_txt.append("aa aaple") 91 | desired_ocr_labels.append(["O", "FRUIT"]) 92 | 93 | # Tokens with regex special characters 94 | ner_labels.append(["O", "FRUIT", "O"]) 95 | gt_txt.append("an apple .*/") 96 | ns_txt.append("@n @ @p|e *. |") 97 | desired_ocr_labels.append(["O", "FRUIT", "FRUIT", "O", "O"]) 98 | 99 | # Tokens with regex special characters with B-labels 100 | ner_labels.append(["O", "B-FRUIT", "O"]) 101 | gt_txt.append("an apple .*/") 102 | ns_txt.append("@n @ @p|e *. 
|") 103 | desired_ocr_labels.append(["O", "B-FRUIT", "I-FRUIT", "O", "O"]) 104 | 105 | # Tokens with regex special characters in BOTH clean and noisy text 106 | ner_labels.append(["O", "O", "ENTERTAINMENT", "O"]) 107 | gt_txt.append("@ new TV !") 108 | ns_txt.append("@ n ow T\\/ |") 109 | desired_ocr_labels.append(["O", "O", "O", "ENTERTAINMENT", "O"]) 110 | 111 | # Tokenize ground truth and noisy text strings 112 | gt_tokens = [preprocess.tokenize(txt) for txt in gt_txt] 113 | ns_tokens = [preprocess.tokenize(txt) for txt in ns_txt] 114 | 115 | # test function expect params in tuple of 116 | # (gt_label, gt_tokens, ocr_tokens, desired_ocr_labels) 117 | LABEL_PROPAGATION_REGRESSION_TEST_CASES = list( 118 | zip(ner_labels, gt_tokens, ns_tokens, desired_ocr_labels) 119 | ) 120 | -------------------------------------------------------------------------------- /tests/unit/degradation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/degradation/__init__.py -------------------------------------------------------------------------------- /tests/unit/degradation/test_degrader.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from unittest.mock import patch 3 | 4 | import numpy as np 5 | import pytest 6 | 7 | from genalog.degradation.degrader import DEFAULT_METHOD_PARAM_TO_INCLUDE 8 | from genalog.degradation.degrader import Degrader, ImageState 9 | 10 | MOCK_IMAGE_SHAPE = (4, 3) 11 | MOCK_IMAGE = np.arange(12, dtype=np.uint8).reshape(MOCK_IMAGE_SHAPE) 12 | 13 | 14 | @pytest.fixture 15 | def empty_degrader(): 16 | effects = [] 17 | return Degrader(effects) 18 | 19 | 20 | @pytest.fixture( 21 | params=[ 22 | [("blur", {"radius": 5})], 23 | [("blur", {"src": ImageState.ORIGINAL_STATE, "radius": 5})], 24 | [("blur", {"src": ImageState.CURRENT_STATE, "radius": 5})], 25 | [ 26 | 
("morphology", {"src": ImageState.ORIGINAL_STATE, "operation": "open"}), 27 | ("morphology", {"operation": "close"}), 28 | ("morphology", {"src": ImageState.ORIGINAL_STATE, "operation": "dilate"}), 29 | ("morphology", {"operation": "erode"}), 30 | ], 31 | [ 32 | ("blur", {"radius": 5}), 33 | ( 34 | "bleed_through", 35 | { 36 | "src": ImageState.CURRENT_STATE, 37 | "alpha": 0.7, 38 | "background": ImageState.ORIGINAL_STATE, 39 | }, 40 | ), 41 | ( 42 | "morphology", 43 | {"operation": "open", "kernel_shape": (3, 3), "kernel_type": "ones"}, 44 | ), 45 | ], 46 | ] 47 | ) 48 | def degrader(request): 49 | effects = request.param 50 | return Degrader(effects) 51 | 52 | 53 | def test_empty_degrader_init(empty_degrader): 54 | assert empty_degrader.effects_to_apply == [] 55 | 56 | 57 | def test_degrader_init(degrader): 58 | assert degrader.effects_to_apply != [] 59 | for effect_tuple in degrader.effects_to_apply: 60 | method_name, method_kwargs = effect_tuple 61 | assert DEFAULT_METHOD_PARAM_TO_INCLUDE in method_kwargs 62 | param_value = method_kwargs[DEFAULT_METHOD_PARAM_TO_INCLUDE] 63 | assert ( 64 | param_value is ImageState.ORIGINAL_STATE 65 | or param_value is ImageState.CURRENT_STATE 66 | ) 67 | 68 | 69 | @pytest.mark.parametrize( 70 | "effects, error_thrown", 71 | [ 72 | ([], None), # Empty effect 73 | (None, TypeError), 74 | ([("blur", {"radius": 5})], None), # Validate input 75 | ([("not_a_func", {"radius": 5})], ValueError), # Invalid method name 76 | ([("blur", {"not_a_argument": 5})], ValueError), # Invalid kwargs 77 | ([("blur")], ValueError), # Missing kwargs 78 | ( 79 | [ 80 | ("blur", {"radius": 5}), 81 | ("bleed_through", {"alpha": "0.8"}), 82 | ("morphology", {"operation": "open"}), 83 | ], 84 | None, 85 | ), # Multiple effects 86 | ( 87 | [ 88 | ("blur", {"radius": 5}), 89 | ("bleed_through", {"not_argument": "0.8"}), 90 | ("morphology", {"missing value"}), 91 | ], 92 | ValueError, 93 | ), # Multiple effects 94 | ], 95 | ) 96 | def 
test_degrader_validate_effects(effects, error_thrown): 97 | if error_thrown: 98 | with pytest.raises(error_thrown): 99 | Degrader.validate_effects(effects) 100 | else: 101 | Degrader.validate_effects(effects) 102 | 103 | 104 | def test_degrader_apply_effects(degrader): 105 | method_names = [effect[0] for effect in degrader.effects_to_apply] 106 | with patch("genalog.degradation.effect") as mock_effect: 107 | degrader.apply_effects(MOCK_IMAGE) 108 | for method in method_names: 109 | assert mock_effect[method].is_called() 110 | # assert degraded.shape == MOCK_IMAGE_SHAPE 111 | 112 | 113 | def test_degrader_apply_effects_e2e(degrader): 114 | degraded = degrader.apply_effects(MOCK_IMAGE) 115 | assert degraded.shape == MOCK_IMAGE_SHAPE 116 | assert degraded.dtype == np.uint8 117 | 118 | 119 | def test_degrader_instructions(degrader): 120 | original_instruction = copy.deepcopy(degrader.effects_to_apply) 121 | degrader.apply_effects(MOCK_IMAGE) 122 | degrader.apply_effects(MOCK_IMAGE) 123 | # Make sure the degradation instructions are not altered 124 | assert len(original_instruction) == len(degrader.effects_to_apply) 125 | for i in range(len(original_instruction)): 126 | org_method_name, org_method_arg = original_instruction[i] 127 | method_name, method_arg = degrader.effects_to_apply[i] 128 | assert org_method_name == method_name 129 | assert len(org_method_arg) == len(method_arg) 130 | for key in org_method_arg.keys(): 131 | assert isinstance(org_method_arg[key], type(method_arg[key])) 132 | assert org_method_arg[key] == method_arg[key] 133 | -------------------------------------------------------------------------------- /tests/unit/generation/2x2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/generation/2x2.jpg -------------------------------------------------------------------------------- /tests/unit/generation/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/generation/__init__.py -------------------------------------------------------------------------------- /tests/unit/generation/templates/font_family.html.jinja: -------------------------------------------------------------------------------- 1 | {{font_family}} -------------------------------------------------------------------------------- /tests/unit/generation/templates/mock.html.jinja: -------------------------------------------------------------------------------- 1 | {{ content }} -------------------------------------------------------------------------------- /tests/unit/generation/templates/multipage.html.jinja: -------------------------------------------------------------------------------- 1 | {% if content and content.content_type.__str__() == "ContentType.COMPOSITE" %} 2 | {% for c in content %} 3 | {% if c.content_type.__str__() == "ContentType.PARAGRAPH"%} 4 |

5 | {{ c }} 6 |

7 | {% else %} 8 |

Unsupported Content Type: {{c.content_type.__str__()}}

9 | {% endif %} 10 | {% endfor %} 11 | {% else %} 12 | No content loaded 13 | {% endif %} 14 | -------------------------------------------------------------------------------- /tests/unit/generation/test_content.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from genalog.generation.content import CompositeContent, Content, ContentType 4 | from genalog.generation.content import Paragraph, Title 5 | 6 | CONTENT_LIST = ["foo", "bar"] 7 | COMPOSITE_CONTENT_TYPE = [ContentType.TITLE, ContentType.PARAGRAPH] 8 | TEXT = "foo bar" 9 | 10 | 11 | @pytest.fixture 12 | def content_base_class(): 13 | return Content() 14 | 15 | 16 | @pytest.fixture 17 | def paragraph(): 18 | return Paragraph(TEXT) 19 | 20 | 21 | @pytest.fixture 22 | def title(): 23 | return Title(TEXT) 24 | 25 | 26 | @pytest.fixture 27 | def section(): 28 | return CompositeContent(CONTENT_LIST, COMPOSITE_CONTENT_TYPE) 29 | 30 | 31 | def test_content_set_content_type(content_base_class): 32 | with pytest.raises(TypeError): 33 | content_base_class.set_content_type("NOT VALID CONTENT TYPE") 34 | content_base_class.set_content_type(ContentType.PARAGRAPH) 35 | 36 | 37 | def test_paragraph_init(paragraph): 38 | with pytest.raises(TypeError): 39 | Paragraph([]) 40 | assert paragraph.content_type == ContentType.PARAGRAPH 41 | 42 | 43 | def test_paragraph_print(paragraph): 44 | assert paragraph.__str__() 45 | 46 | 47 | def test_paragraph_iterable_indexable(paragraph): 48 | for index, character in enumerate(paragraph): 49 | assert character == paragraph[index] 50 | 51 | 52 | def test_title_init(title): 53 | with pytest.raises(TypeError): 54 | Title([]) 55 | assert title.content_type == ContentType.TITLE 56 | 57 | 58 | def test_title_iterable_indexable(title): 59 | for index, character in enumerate(title): 60 | assert character == title[index] 61 | 62 | 63 | def test_composite_content_init(section): 64 | with pytest.raises(TypeError): 65 | CompositeContent((), 
[]) 66 | assert section.content_type == ContentType.COMPOSITE 67 | 68 | 69 | def test_composite_content_iterable(section): 70 | for index, content in enumerate(section): 71 | assert content.content_type == COMPOSITE_CONTENT_TYPE[index] 72 | 73 | 74 | def test_composite_content_print(section): 75 | assert "foo" in section.__str__() 76 | assert "bar" in section.__str__() 77 | -------------------------------------------------------------------------------- /tests/unit/ocr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/ocr/__init__.py -------------------------------------------------------------------------------- /tests/unit/ocr/data/img/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/ocr/data/img/0.png -------------------------------------------------------------------------------- /tests/unit/ocr/data/img/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/ocr/data/img/1.png -------------------------------------------------------------------------------- /tests/unit/ocr/data/img/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/ocr/data/img/11.png -------------------------------------------------------------------------------- /tests/unit/ocr/data/metrics.csv: -------------------------------------------------------------------------------- 1 | 
edit_insert,edit_delete,edit_replace,edit_insert_spacing,edit_delete_spacing,insert,delete,replace,spacing,total_chars,total_words,total_alnum_words,matching_chars,matching_alnum_words,matching_words,alnum_word_accuracy,word_accuracy,char_accuracy,txt_path,ocr_json_path,filename 2 | 1,0,0,1,13,1,0,0,14,1027,166,159,1025,144,150,0.9056603773584906,0.9036144578313253,0.9980525803310614,tests/unit/ocr/data/text/0.txt,tests/unit/ocr/data/json/521c38122f783673598856cd81d91c21_0.png.json,0.txt 3 | 3,0,0,0,5,3,0,0,5,958,182,176,955,165,171,0.9375,0.9395604395604396,0.9968684759916493,tests/unit/ocr/data/text/1.txt,tests/unit/ocr/data/json/521c38122f783673598856cd81d91c21_1.png.json,1.txt 4 | 2,0,0,0,9,2,0,0,9,1022,188,183,1020,170,175,0.9289617486338798,0.9308510638297872,0.9980430528375733,tests/unit/ocr/data/text/11.txt,tests/unit/ocr/data/json/521c38122f783673598856cd81d91c21_11.png.json,11.txt 5 | -------------------------------------------------------------------------------- /tests/unit/ocr/data/metrics/json/123_001.json: -------------------------------------------------------------------------------- 1 | [{"text": "BIRDS\n\nOF\n\nGREAT BRITAIN AND IRELAND\n\nORDER PASSERES\n\nFAMILY ORIOLID^.\n\nTHIS famil}^ consists of a tropical group of brightly coloured birds in whicli\nyellow and black, or scarlet and black, are the prevailing hues. 
Although\nin the general form of their heads they somewhat remind one of Starlings,\nthey must not be confounded with the so-called \"Orioles\" of the New World,\nwhich belong to the family IdcridcE or Hang-nests and Troupials, a group of birds\nlinking the Finches and the Starlings, and feeding largely upon seeds and insects.\n\nThe late Henry Seebohm was of opinion that the Orioles were nearly related\nto the Crows ; he, therefore, placed the genus Oriolus in his Subfamily Corvince, from\nwhich he said that they chiefly differed in their exposed nostrils, although he\nadmitted that the tarsus might perhaps be slightly shorter, and the prevailing\ncolours different ; whilst the sexes also were dissimilar.*\n\nIn addition to the above distinctive characters, the third primary of the wing\n(not the fourth or fifth) appears to be the longest, in the Orioles; whilst the\n"}] -------------------------------------------------------------------------------- /tests/unit/ocr/data/metrics/json/123_002.json: -------------------------------------------------------------------------------- 1 | [{"text": "whole character of the nest, which Seebohm often made much of iu his classifi-\ncation, is quite unlike that of a Crow ; being neatly woven, and slung like a\nhammock between the forks of a branch : moreover, whereas the eggs of the Crows\nare usually of some shade of green or blue, heavily spotted and speckled, or\nblotched and mottled, with various shades of olive or brown, those of the Orioles\nvary from white to salmon-pink, clearly spotted with blackish-brown, and some-\ntimes with lilacine-greyisli shell-markings.\n\nThe call-notes and songs of the Orioles are bright and melodious ; but this\nfact would not be a sufficient reason for dissociating them from the Crows ;\nalthough our native species of Corvidcr do not shine as whistlers, in their wild\nstate. 
I think, however, that Howard Saunders was fully justified in adopting\nthe present family for the Orioles.\n\nFamilx- ORIOL ID^E.\n\nThe Golden Oriole.\n\nOriolus ga/bula, LiNN.\n\nBREEDS in suitable localities throughout Europe south of the Baltic and in\nAlgeria ; passes through Greece, Asia Minor, Palestine, Egypt, and Nubia,\non migration ; and winters in North Africa, south-eastwards to Madagascar,\nNatal, and westwards to Damara Land : stragglers sometimes occur in Madeira,\nand the Azores.\n\nThe Golden Oriole is a regular visitor to our shores in spring, the largest\nnumber having been seen in the Scilly Islands, and Cornwall ; it has, however,\nbeen met with in not a few of the southern and south-eastern counties, and several\ninstances of its breeding with us have been recorded. In 1868, I saw a male\nspecimen of this bird near Linton, in Devonshire, and in July, 1887, I was just\ntoo late to see the species in Essex ; Mr. Fitch, of Maldon (whom I was visiting)\ninformed me that the bird had been seen in one of his thickets during the previous\n"}] -------------------------------------------------------------------------------- /tests/unit/ocr/data/metrics/json/123_003.json: -------------------------------------------------------------------------------- 1 | [{"text": "week. We visited the place iu the hope of discovering a uest, but were unsuc-\ncessful.\n\nIn Ireland it has chiefly occurred on the east coast, most of the examples\nbeing females, or immature males ; a specimen was recorded as shot in the Faroe\nIslands, in Maj^ 1893, by Col. H. W. Feildeu. In June, 1906, one was killed\nby a cat on the Marine Parade at Brighton. 
Perhaps the nearest point to\nLondon at which it has been recognized, was noted in the \"Zoologist\" for 1892,\nan example having apparently been seen in Richmond Park.\n\nThe male of this species is bright gamboge-yellow, the lores, wings (excepting\nthe terminal third of the primary-coverts) and a great part of the tail black ; the\nprimaries, excepting the two outermost, are edged externally, and the secondaries\nare tipped with j^ellowish-white ; the two central tail-feathers are yellowish at the\nbase, and yellow at the tip, and the other feathers have the terminal third of the\nouter webs, and borders of the inner webs yellow ; bill reddish-ochreous ; feet\nleaden-grey ; iris bright red. The female is much duller than the male, greener,\nand with the black colouring replaced by deep brown ; the throat, breast, and\ncentre of belly whitish ; the throat, breast, and flanks streaked with greyish.\nYoung birds are greener and browner than the female, but otherwise similar ;\nnestlings have the upper parts olivaceous, spotted with yellow.\n\nThe Golden Oriole frequents gardens, groves, plantations, thickets, and the\noutskirts of large woods, especially in the neighbourhood of water ; it seems to\nprefer the haunts of man, yet is so shy that it rarely remains in view for more\nthan a minute as it flies rapidly, in somewhat Thrush-like, though more undulating\nfashion, from cover to cover ; choosing ever the densest foliage, as if aware of the\nperilous brilliance of its plumage : possibly it may slowly be acquiring a hereditary\nknowledge of the fact that, if but a glimpse is obtained of it, an attempt at least\nis made to put an end to its life ; or if it fails to comprehend so much, it may\ninherit a dread of the thunder and lightning which, for generations, have heralded\nits appearance : birds are not naturally fearful of man ; for even those which have\nbeen taught by their parents to dread him, can be generally converted by gentleness\nand petting : moreover 
the fact that a grown man can tame a small bird, whereas\neven the tamest will always show the greatest fear of a little boy, certainly seems\nto prove that the instinctive dread of the monkey-nature in the latter is deeply\nimplanted in all birds ; j ust as is that of a cat, even though that animal may\nnever have been seen by the bird previously.*\n"}] -------------------------------------------------------------------------------- /tests/unit/ocr/data/metrics/metrics.csv: -------------------------------------------------------------------------------- 1 | edit_insert,edit_delete,edit_replace,edit_insert_spacing,edit_delete_spacing,insert,delete,replace,spacing,total_chars,total_words,total_alnum_words,matching_chars,matching_alnum_words,matching_words,alnum_word_accuracy,word_accuracy,char_accuracy,txt_path,ocr_json_path,filename 2 | 2,5,5,0,2,1,1,5,2,1068,176,176,1061,169,169,0.9602272727272727,0.9602272727272727,0.9934456928838952,tests/unit/ocr/data/metrics/text/001.txt,tests/unit/ocr/data/metrics/json/123_001.png.json,001.txt 3 | 0,5,17,0,11,0,2,8,11,1789,301,301,1772,283,283,0.9401993355481728,0.9401993355481728,0.9904974846282839,tests/unit/ocr/data/metrics/text/002.txt,tests/unit/ocr/data/metrics/json/123_002.png.json,002.txt 4 | 0,1,6,0,17,0,0,5,17,2659,460,459,2653,436,437,0.9498910675381264,0.95,0.9977435125987213,tests/unit/ocr/data/metrics/text/003.txt,tests/unit/ocr/data/metrics/json/123_003.png.json,003.txt 5 | -------------------------------------------------------------------------------- /tests/unit/ocr/data/metrics/substitution.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/ocr/data/metrics/substitution.pkl -------------------------------------------------------------------------------- /tests/unit/ocr/data/metrics/text/001.txt: 
-------------------------------------------------------------------------------- 1 | BIRDS 2 | 3 | OF 4 | 5 | GREAT BRITAIN AND IRELAND 6 | 7 | ORDER PASSERES 8 | 9 | FAMILY ORIOLIDA. 10 | 11 | THIS family consists of a tropical group of brightly coloured birds in which 12 | yellow and black, or scarlet and black, are the prevailing hues. Although 13 | in the general form of their heads they somewhat remind one of Starlings, 14 | they must not be confounded with the so-called "Orioles" of the New World, 15 | which belong to the family Icterida or Hang-nests and Troupials, a group of birds 16 | linking the Finches and the Starlings, and feeding largely upon seeds and insects. 17 | 18 | The late Henry Seebohm was of opinion that the Orioles were nearly related 19 | to the Crows; he, therefore, placed the genus Oriolus in his Subfamily Corvina, from 20 | which he said that they chiefly differed in their exposed nostrils, although he 21 | admitted that the tarsus might perhaps be slightly shorter, and the prevailing 22 | colours different; whilst the sexes also were dissimilar.* 23 | 24 | In addition to the above distinctive characters, the third primary of the wing 25 | (not the fourth or fifth) appears to be the longest, in the Orioles; whilst the 26 | -------------------------------------------------------------------------------- /tests/unit/ocr/data/metrics/text/002.txt: -------------------------------------------------------------------------------- 1 | whole character of the nest, which Seebohm often made much of in his classifi- 2 | cation, is quite unlike that of a Crow; being neatly woven, and slung like a 3 | hammock between the forks of a branch: moreover, whereas the eggs of the Crows 4 | are usually of some shade of green or blue, heavily spotted and speckled, or 5 | blotched and mottled, with various shades of olive or brown, those of the Orioles 6 | vary from white to salmon-pink, clearly spotted with blackish-brown, and some- 7 | times with 
lilacine-greyish shell-markings. 8 | 9 | The call-notes and songs of the Orioles are bright and melodious; but this 10 | fact would not be a sufficient reason for dissociating them from the Crows; 11 | although our native species of Corvid do not shine as whistlers, in their wild 12 | state. I think, however, that Howard Saunders was fully justified in adopting 13 | the present family for the Orioles. 14 | 15 | Family-ORIOLID. 16 | 17 | THE GOLDEN ORIOLE. 18 | 19 | Oriolus galbula, LINN. 20 | 21 | BREEDS in suitable localities throughout Europe south of the Baltic and in 22 | Algeria; passes through Greece, Asia Minor, Palestine, Egypt, and Nubia, 23 | on migration; and winters in North Africa, south-eastwards to Madagascar, 24 | Natal, and westwards to Damara Land: stragglers sometimes occur in Madeira, 25 | and the Azores. 26 | 27 | The Golden Oriole is a regular visitor to our shores in spring, the largest 28 | number having been seen in the Scilly Islands, and Cornwall; it has, however, 29 | been met with in not a few of the southern and south-eastern counties, and several 30 | instances of its breeding with us have been recorded. In 1868, I saw a male 31 | specimen of this bird near Linton, in Devonshire, and in July, 1887, I was just 32 | too late to see the species in Essex; Mr. Fitch, of Maldon (whom I was visiting) 33 | informed me that the bird had been seen in one of his thickets during the previous 34 | -------------------------------------------------------------------------------- /tests/unit/ocr/data/metrics/text/003.txt: -------------------------------------------------------------------------------- 1 | week. We visited the place in the hope of discovering a nest, but were unsuc- 2 | cessful. 3 | 4 | In Ireland it has chiefly occurred on the east coast, most of the examples 5 | being females, or immature males; a specimen was recorded as shot in the Faroe 6 | Islands, in May, 1893, by Col. H. W. Feilden. 
In June, 1906, one was killed 7 | by a cat on the Marine Parade at Brighton. Perhaps the nearest point to 8 | London at which it has been recognized, was noted in the "Zoologist" for 1892, 9 | an example having apparently been seen in Richmond Park. 10 | 11 | The male of this species is bright gamboge-yellow, the lores, wings (excepting 12 | the terminal third of the primary-coverts) and a great part of the tail black; the 13 | primaries, excepting the two outermost, are edged externally, and the secondaries 14 | are tipped with yellowish-white; the two central tail-feathers are yellowish at the 15 | base, and yellow at the tip, and the other feathers have the terminal third of the 16 | outer webs, and borders of the inner webs yellow; bill reddish-ochreous; feet 17 | leaden-grey; iris bright red. The female is much duller than the male, greener, 18 | and with the black colouring replaced by deep brown; the throat, breast, and 19 | centre of belly whitish; the throat, breast, and flanks streaked with greyish. 20 | Young birds are greener and browner than the female, but otherwise similar; 21 | nestlings have the upper parts olivaceous, spotted with yellow. 
22 | 23 | The Golden Oriole frequents gardens, groves, plantations, thickets, and the 24 | outskirts of large woods, especially in the neighbourhood of water; it seems to 25 | prefer the haunts of man, yet is so shy that it rarely remains in view for more 26 | than a minute as it flies rapidly, in somewhat Thrush-like, though more undulating 27 | fashion, from cover to cover; choosing ever the densest foliage, as if aware of the 28 | perilous brilliance of its plumage: possibly it may slowly be acquiring a hereditary 29 | knowledge of the fact that, if but a glimpse is obtained of it, an attempt at least 30 | is made to put an end to its life; or if it fails to comprehend so much, it may 31 | inherit a dread of the thunder and lightning which, for generations, have heralded 32 | its appearance: birds are not naturally fearful of man ; for even those which have 33 | been taught by their parents to dread him, can be generally converted by gentleness 34 | and petting: moreover the fact that a grown man can tame a small bird, whereas 35 | even the tamest will always show the greatest fear of a little boy, certainly seems 36 | to prove that the instinctive dread of the monkey-nature in the latter is deeply 37 | implanted in all birds; just as is that of a cat, even though that animal may 38 | never have been seen by the bird previously.* 39 | -------------------------------------------------------------------------------- /tests/unit/ocr/data/substitution.json: -------------------------------------------------------------------------------- 1 | {"tests/unit/ocr/data/text\\0.txt": {}, "tests/unit/ocr/data/text\\1.txt": {}, "tests/unit/ocr/data/text\\11.txt": {}} -------------------------------------------------------------------------------- /tests/unit/ocr/data/substitution.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/ocr/data/substitution.pkl -------------------------------------------------------------------------------- /tests/unit/ocr/data/text/0.txt: -------------------------------------------------------------------------------- 1 | basically ,it was unanimously agreed by the various relevant parties .To its determination ,the Chinese regulatory department compares this reform to a die that has been cast . takes time to prove whether the stock can really meet expectations ,and any deviations that arise during the reform can be promptly corrected . viewers ,the China News program will here .This is Xu Li .Thank you for watching .Coming up is the Focus program hosted by Wang Shilin . ,dear viewers .Hello ,dear viewers . to Focus Today .Today ,let 's turn attention to a road cave -in accident happened in Beijing over the holiday Before dawn on January 3 ,a sewage leakage accident occurred at the main side roads of Jingguang Bridge ,East Ring Road ,Beijing Municipality , in the road caving in .Relevant from Beijing Municipality promptly emergency contingency plans .The administration department carried out supervision near the accident scene . ,how did the emergency response activated by governmental departments effectively during the holiday ? -------------------------------------------------------------------------------- /tests/unit/ocr/data/text/1.txt: -------------------------------------------------------------------------------- 1 | After the holiday ,what will be done handle citizens ' peak commute ? In ,what measures did relevant take to resolve issues such as waste ,heating ,and communication ,in order ensure that the lives of citizens not affected ? 
Well ,we have invited honorable guests to the studio today follow this topic with us .One of the honorable guests in the studio is Zhou Hanhua from the Institute of Law the Chinese Academy of Social .Hello .Next is Yang Yang ,a host of Traffic Radio Station .Hello .Welcome of you to the studio to participate our program .Well ,I especially want know ,ha ,how the two of you found the news on the day of the accident ? ,,about 11:00 m. yesterday ,ah ,I to find out through an SMS when I was .Uh-huh .Uh-huh .It happened that I going to have lunch with a friend ,um at noon .And then ,the friend first me an SMS ,Uh-huh .saying he would pick me up to go together .After that I received an SMS from 1860 .Uh-huh , was through an SMS . -------------------------------------------------------------------------------- /tests/unit/ocr/data/text/11.txt: -------------------------------------------------------------------------------- 1 | Furthermore ,Chaoyang Road is an .Uh-huh .Whether it is Chaoyang Road the east -west direction or the main side roads of East Third Ring Road in south -north direction ,as we can see this diagram ,it can be said that the at the main and side roads of East Ring Road normally has quite heavy ,especially during commuting times . ,Chaoyang Road is a very important in the east -west direction .Yes . people living in the west want to over from the city ,they have to go this road .Hence ,if a traffic occurs at this place ,we can indeed ,ha ,how widespread ,ah ,the extent the impact will be ,such as the of cars caught in traffic jams .Yes , I think everyone can see that from buses that cross Jingguang Bridge . .As buses that cross the Third Ring are currently ,right now affected by Jingguang Bridge accident ,ah ,the results this morning show that 32 bus throughout the neighborhood have had be rerouted .Uh-huh .Well ,I think perhaps many friends in other places wondering how one place is able to 32 commuter routes . 
"""Unit tests for the GROK OCR REST client.

Every outbound HTTP verb used by ``GrokRestClient`` (``put``/``post``/
``get``/``delete``) is monkeypatched to return :class:`MockedResponse`,
which fakes the subset of the Azure Cognitive Search REST API surface the
client talks to. No network access occurs.
"""
import json
import os

import pytest
import requests

from genalog.ocr.rest_client import GrokRestClient


def _load_json(path):
    """Load a JSON fixture, closing the file handle deterministically.

    The original tests used ``json.load(open(path, "r"))`` which leaks the
    handle until garbage collection (and warns under ``-W error``).
    """
    with open(path, "r") as f:
        return json.load(f)


@pytest.fixture(scope="module", autouse=True)
def set_azure_dummy_secrets(load_azure_resources):
    # The mocked endpoints never authenticate, so empty secrets suffice;
    # they only need to exist for GrokRestClient.create_from_env_var().
    os.environ['BLOB_KEY'] = ""
    os.environ['SEARCH_SERVICE_KEY'] = ""
    os.environ['COGNITIVE_SERVICE_KEY'] = ""


@pytest.fixture(autouse=True)
def setup_monkeypatch(monkeypatch):
    def mock_http(*args, **kwargs):
        return MockedResponse(args, kwargs)

    # Route every HTTP verb the client uses through the mock.
    monkeypatch.setattr(requests, "put", mock_http)
    monkeypatch.setattr(requests, "post", mock_http)
    monkeypatch.setattr(requests, "get", mock_http)
    monkeypatch.setattr(requests, "delete", mock_http)


class MockedResponse:
    """Minimal stand-in for ``requests.Response``.

    Dispatches on substrings of the requested URL to emulate the Azure
    Search skillset / indexer / index / datasource endpoints.
    """

    def __init__(self, args, kwargs):
        self.url = args[0]
        self.text = "response"
        self.headers = kwargs["headers"]

    def json(self):
        if "search.windows.net/skillsets/" in self.url:
            return {}

        if "search.windows.net/indexers/" in self.url:
            if "status" in self.url:
                return {"lastResult": {"status": "success"}, "status": "finished"}
            return {}

        if "search.windows.net/indexes/" in self.url:
            if "docs/search" in self.url:
                # Fake search hits: one document per OCR'd page fixture.
                prefix = "tests/unit/ocr/data/json/521c38122f783673598856cd81d91c21"
                return {
                    "value": [
                        {
                            "metadata_storage_name": (
                                f"521c38122f783673598856cd81d91c21_{page}.png"
                            ),
                            "layoutText": _load_json(f"{prefix}_{page}.png.json"),
                        }
                        for page in (0, 1, 11)
                    ]
                }
            # BUGFIX: was ``return json.dumps({})`` — a JSON *string*, while
            # every other branch returns a dict. ``Response.json()`` must
            # return parsed JSON, so return the empty dict itself.
            return {}
        if "search.windows.net/datasources/" in self.url:
            return {}

        raise ValueError(f"{self.url} not valid")

    def raise_for_status(self):
        # Mocked responses always represent HTTP success.
        pass


class TestGROK:
    def test_creating_indexing_pipeline(self):
        """Pipeline creation and teardown round-trip against the mock."""
        grok_rest_client = GrokRestClient.create_from_env_var()
        grok_rest_client.create_indexing_pipeline()
        grok_rest_client.delete_indexer_pipeline()

    def test_running_indexer(self):
        """Run the indexer and poll until the mock reports success."""
        grok_rest_client = GrokRestClient.create_from_env_var()
        grok_rest_client.create_indexing_pipeline()

        indexer_status = grok_rest_client.get_indexer_status()
        if indexer_status["status"] == "error":
            raise RuntimeError(f"indexer error: {indexer_status}")

        # if not already running start the indexer
        if indexer_status["lastResult"]["status"] != "inProgress":
            grok_rest_client.run_indexer()

        grok_rest_client.run_indexer()
        indexer_status = grok_rest_client.poll_indexer_till_complete()
        assert indexer_status["lastResult"]["status"] == "success"
        grok_rest_client.delete_indexer_pipeline()
No I do n't think so /. Orlando Florida hello /. Mike I wanted to know /. Go ahead /. the first time I was in New York I saw a nice looking young man on TV in a show Mike and Buffy /. was that you /? That was me he he /. That was me and Buff Cobb who was my /- That 's not Buff talking , is it /? No no ha ha /. cause Buff is up in New Hampshire /. She lives in a home up there /. She 's not well /. Um yeah she and I used to do a show on CBS when I first came to New York /. and it was a fascinating /. it was a little bit like Regis and uh Kathy or uh Regis and Kelly /. But you were married , right /? Yes /. What was it like to do a show with the wife /? Not easy /. Ha ha /. I 'm serious /. You know uh I 'd love to see that /. uh We used to bicker on the air /. and what happened was after a while the bickering continued after we got off the air /. After you got off the air /. You know what I mean /. I know /. Detroit hello /. Hi /. Hi /. How are you /? Fine /. Mr. Wallace this is a big pleasure for me to talk to you /. But um uh what is your most difficult interview that you had in Sixty Minutes the most difficult person that you could have ever interviewed /? I think probably the Aiatola really because he was not anxious to do it /. It was um just after the US hostages had been taken in Iran /. and I was surprised that he was willing to talk to us /. and it was a very very difficult business /. We did it in the holy city of which uh we /- and the circumstances were difficult /. They took good care to see that we did n't get into trouble /. Ha ha /. We 'll take a break /. And he just /- We 'll be back with more of Mike Wallace /. The book is Between You and Me /. the DVD is included /. oh what can one say it 's a terrific work /. We 'll be right back /. That voice was the subject of The Insider /. That man /. that man remains my hero /. Jeff Wygan who took on the tobacco cartel if you will /. 
And you remember when all those guys who ran the companies raised their hands and said Oh it 's not addictive /. they knew it was addictive /. And he has succeeded /. I mean really he has succeeded /. He runs a foundation for Smoke Free Kids /. and he 's gotten all kinds of success in all kinds of ways in foreign countries and so forth /. The man is my hero /. And you are mine /. And we have a minute and a half left /. I know you 're asked this all the time /. but how long you going to keep on keeping on /. How long you /- you know what the dickens would I do /? what would I do /? How long are you going to keep doing what you 're /- Yeah but you 're /- How old are you Mike /? Eighty - seven /. can you imagine /? I 'm going to be seventy - two /. so you 're fifteen years older than me /. That 's why I feel like a kid compared to you /. /. -------------------------------------------------------------------------------- /tests/unit/text/data/label_generator/labels/0.tsv: -------------------------------------------------------------------------------- 1 | basically O 2 | , O 3 | it O 4 | was O 5 | unanimously O 6 | agreed O 7 | upon O 8 | by O 9 | the O 10 | various O 11 | relevant O 12 | parties O 13 | . O 14 | 15 | To O 16 | express O 17 | its O 18 | determination O 19 | , O 20 | the O 21 | Chinese O 22 | securities O 23 | regulatory O 24 | department O 25 | compares O 26 | this O 27 | stock O 28 | reform O 29 | to O 30 | a O 31 | die O 32 | that O 33 | has O 34 | been O 35 | cast O 36 | . O 37 | 38 | It O 39 | takes O 40 | time O 41 | to O 42 | prove O 43 | whether O 44 | the O 45 | stock O 46 | reform O 47 | can O 48 | really O 49 | meet O 50 | expectations O 51 | , O 52 | and O 53 | whether O 54 | any O 55 | deviations O 56 | that O 57 | arise O 58 | during O 59 | the O 60 | stock O 61 | reform O 62 | can O 63 | be O 64 | promptly O 65 | corrected O 66 | . 
O 67 | 68 | Dear O 69 | viewers B-PERSONTYPE 70 | , O 71 | the O 72 | China B-ORGANIZATION 73 | News I-ORGANIZATION 74 | program O 75 | will O 76 | end O 77 | here O 78 | . O 79 | 80 | This O 81 | is O 82 | Xu B-PERSONNAME 83 | Li I-PERSONNAME 84 | . O 85 | 86 | Thank O 87 | you O 88 | everyone O 89 | for O 90 | watching O 91 | . O 92 | 93 | Coming O 94 | up O 95 | is O 96 | the O 97 | Focus B-ORGANIZATION 98 | Today I-ORGANIZATION 99 | program O 100 | hosted O 101 | by O 102 | Wang B-PERSONNAME 103 | Shilin I-PERSONNAME 104 | . O 105 | 106 | Good-bye O 107 | , O 108 | dear O 109 | viewers B-PERSONTYPE 110 | . O 111 | 112 | Hello O 113 | , O 114 | dear O 115 | viewers B-PERSONTYPE 116 | . O 117 | 118 | Welcome O 119 | to O 120 | Focus B-ORGANIZATION 121 | Today I-ORGANIZATION 122 | . O 123 | 124 | Today B-DATE 125 | , O 126 | let O 127 | 's O 128 | turn O 129 | our O 130 | attention O 131 | to O 132 | a O 133 | road O 134 | cave O 135 | - O 136 | in O 137 | accident O 138 | that O 139 | happened O 140 | in O 141 | Beijing B-GPE 142 | over O 143 | the O 144 | holiday O 145 | . O 146 | 147 | Before B-DATETIMERANGE 148 | dawn I-DATETIMERANGE 149 | on O 150 | January B-DATE 151 | 3 I-DATE 152 | , O 153 | a O 154 | sewage O 155 | pipe O 156 | leakage O 157 | accident O 158 | occurred O 159 | at O 160 | the O 161 | main O 162 | and O 163 | side O 164 | roads O 165 | of O 166 | Jingguang B-LOCATION 167 | Bridge I-LOCATION 168 | , O 169 | East B-ADDRESS 170 | Third I-ADDRESS 171 | Ring I-ADDRESS 172 | Road I-ADDRESS 173 | , O 174 | Beijing B-GPE 175 | Municipality I-GPE 176 | , O 177 | resulting O 178 | in O 179 | the O 180 | road O 181 | caving O 182 | in O 183 | . O 184 | 185 | Relevant O 186 | departments O 187 | from O 188 | Beijing B-GPE 189 | Municipality I-GPE 190 | promptly O 191 | activated O 192 | emergency O 193 | contingency O 194 | plans O 195 | . 
O 196 | 197 | The O 198 | traffic O 199 | administration O 200 | department O 201 | carried O 202 | out O 203 | traffic O 204 | supervision O 205 | near O 206 | the O 207 | accident O 208 | scene O 209 | . O 210 | 211 | Well O 212 | , O 213 | how O 214 | did O 215 | the O 216 | emergency O 217 | response O 218 | mechanisms O 219 | activated O 220 | by O 221 | governmental O 222 | departments O 223 | operate O 224 | effectively O 225 | during O 226 | the O 227 | holiday O 228 | ? O 229 | 230 | -------------------------------------------------------------------------------- /tests/unit/text/data/label_generator/labels/1.tsv: -------------------------------------------------------------------------------- 1 | After O 2 | the O 3 | holiday O 4 | , O 5 | what O 6 | will O 7 | be O 8 | done O 9 | to O 10 | handle O 11 | citizens B-PERSONTYPE 12 | ' O 13 | peak O 14 | commute O 15 | ? O 16 | 17 | In O 18 | addition O 19 | , O 20 | what O 21 | measures O 22 | did O 23 | relevant O 24 | departments O 25 | take O 26 | to O 27 | resolve O 28 | issues O 29 | such O 30 | as O 31 | waste O 32 | discharge O 33 | , O 34 | heating O 35 | , O 36 | and O 37 | communication O 38 | , O 39 | in O 40 | order O 41 | to O 42 | ensure O 43 | that O 44 | the O 45 | lives O 46 | of O 47 | citizens B-PERSONTYPE 48 | were O 49 | not O 50 | affected O 51 | ? O 52 | 53 | Well O 54 | , O 55 | we O 56 | have O 57 | invited O 58 | two B-NUMBER 59 | honorable O 60 | guests B-PERSONTYPE 61 | to O 62 | the O 63 | studio B-LOCATION 64 | today B-DATE 65 | to O 66 | follow O 67 | this O 68 | topic O 69 | with O 70 | us O 71 | . 
O 72 | 73 | One B-NUMBER 74 | of O 75 | the O 76 | two B-NUMBER 77 | honorable O 78 | guests B-PERSONTYPE 79 | in O 80 | the O 81 | studio B-LOCATION 82 | is O 83 | Professor O 84 | Zhou B-PERSONNAME 85 | Hanhua I-PERSONNAME 86 | from O 87 | the O 88 | Institute B-ORGANIZATION 89 | of I-ORGANIZATION 90 | Law I-ORGANIZATION 91 | of O 92 | the O 93 | Chinese B-ORGANIZATION 94 | Academy I-ORGANIZATION 95 | of I-ORGANIZATION 96 | Social I-ORGANIZATION 97 | Sciences I-ORGANIZATION 98 | . O 99 | 100 | Hello O 101 | . O 102 | 103 | Next B-ORDINAL 104 | is O 105 | Yang B-PERSONNAME 106 | Yang I-PERSONNAME 107 | , O 108 | a O 109 | host O 110 | of O 111 | Beijing B-ORGANIZATION 112 | Traffic I-ORGANIZATION 113 | Radio I-ORGANIZATION 114 | Station I-ORGANIZATION 115 | . O 116 | 117 | Hello O 118 | . O 119 | 120 | Welcome O 121 | both O 122 | of O 123 | you O 124 | to O 125 | the O 126 | studio O 127 | to O 128 | participate O 129 | in O 130 | our O 131 | program O 132 | . O 133 | 134 | Well O 135 | , O 136 | I O 137 | especially O 138 | want O 139 | to O 140 | know O 141 | , O 142 | ha O 143 | , O 144 | how O 145 | the O 146 | two B-NUMBER 147 | of O 148 | you O 149 | found O 150 | out O 151 | the O 152 | news O 153 | on O 154 | the B-DATE 155 | day I-DATE 156 | of O 157 | the O 158 | accident B-EVENT 159 | ? O 160 | 161 | Ah O 162 | , O 163 | , O 164 | about O 165 | 11:00 B-NUMBER 166 | m. O 167 | yesterday B-DATE 168 | , O 169 | ah O 170 | , O 171 | I O 172 | happened O 173 | to O 174 | find O 175 | out O 176 | through O 177 | an O 178 | SMS O 179 | when O 180 | I O 181 | was O 182 | outside O 183 | . O 184 | 185 | Uh-huh O 186 | . O 187 | 188 | Uh-huh O 189 | . O 190 | 191 | It O 192 | happened O 193 | that O 194 | I O 195 | was O 196 | going O 197 | to O 198 | have O 199 | lunch B-TIMERANGE 200 | with O 201 | a O 202 | friend B-PERSONTYPE 203 | , O 204 | um O 205 | , O 206 | at O 207 | noon B-TIME 208 | . 
O 209 | 210 | And O 211 | then O 212 | , O 213 | the O 214 | friend B-PERSONTYPE 215 | first B-ORDINAL 216 | sent O 217 | me O 218 | an O 219 | SMS O 220 | , O 221 | Uh-huh O 222 | . O 223 | 224 | saying O 225 | he O 226 | would O 227 | come O 228 | pick O 229 | me O 230 | up O 231 | to O 232 | go O 233 | together O 234 | . O 235 | 236 | After O 237 | that O 238 | , O 239 | I O 240 | received O 241 | an O 242 | SMS O 243 | from B-DATERANGE 244 | 1860 I-DATERANGE 245 | . O 246 | 247 | Uh-huh O 248 | , O 249 | it O 250 | was O 251 | through O 252 | an O 253 | SMS B-ORGANIZATION 254 | . O 255 | 256 | -------------------------------------------------------------------------------- /tests/unit/text/data/label_generator/text/0.txt: -------------------------------------------------------------------------------- 1 | basically, it was unanimously agreed upon by the various relevant parties. To express its determination, the Chinese securities regulatory department compares this stock reform to a die that has been cast. It takes time to prove whether the stock reform can really meet expectations, and whether any deviations that arise during the stock reform can be promptly corrected. Dear viewers, the China News program will end here. This is Xu Li. Thank you everyone for watching. Coming up is the Focus Today program hosted by Wang Shilin. Good-bye, dear viewers. Hello, dear viewers. Welcome to Focus Today. Today, let's turn our attention to a road cave - in accident that happened in Beijing over the holiday. Before dawn on January 3, a sewage pipe leakage accident occurred at the main and side roads of Jingguang Bridge , East Third Ring Road, Beijing Municipality, resulting in the road caving in. Relevant departments from Beijing Municipality promptly activated emergency contingency plans. The traffic administration department carried out traffic supervision near the accident scene. 
Well, how did the emergency response mechanisms activated by governmental departments operate effectively during the holiday ? -------------------------------------------------------------------------------- /tests/unit/text/data/label_generator/text/1.txt: -------------------------------------------------------------------------------- 1 | After the holiday, what will be done to handle citizens' peak commute? In addition, what measures did relevant departments take to resolve issues such as waste discharge, heating, and communication, in order to ensure that the lives of citizens were not affected? Well, we have invited two honorable guests to the studio today to follow this topic with us. One of the two honorable guests in the studio is Professor Zhou Hanhua from the Institute of Law of the Chinese Academy of Social Sciences. Hello. Next is Yang Yang, a host of Beijing Traffic Radio Station. Hello. Welcome both of you to the studio to participate in our program. Well, I especially want to know, ha, how the two of you found out the news on the day of the accident? Ah,, about 11:00 m. yesterday, ah, I happened to find out through an SMS when I was outside. Uh-huh. Uh-huh. It happened that I was going to have lunch with a friend, um, at noon. And then, the friend first sent me an SMS, Uh-huh. saying he would come pick me up to go together. After that, I received an SMS from 1860. Uh-huh, it was through an SMS. -------------------------------------------------------------------------------- /tests/unit/text/data/label_generator/text/11.txt: -------------------------------------------------------------------------------- 1 | And you, Yang Yang? A friend happened to call me. You were not at work that day? No. The station called me at noon and said something happened at Jingguang Bridge and that I had to go to the station immediately to research the upcoming program. Uh-huh, that means, er, you found out the accident through an information source at the station. 
Right, right, right. Uh-huh. Well, like Professor Zhou, I also received this news, ha, through a mobile phone SMS. At that time,, it can be said that this SMS was among the many, ha, SMS containing New Year wishes, like Happy New Year, received after the start of the New Year. Uh-huh. Ah, actually I felt a lot of warmth when I received that SMS. Although we live in the west instead of the east and it did not affect us much, I think it is very useful, ah, to inform people of this kind of news. Yes, exceptionally. Yes, exceptionally. Well, what in fact was the content of that SMS? Let's take a look via this footage, ha. I remember the SMS was written like this at that time, saying that, ah, there was a sewage pipe leakage accident on the side road at the southeast corner of Jingguang Bridge at East Third Ring Road, and, well, traffic supervision was implemented near Chaoyang Road, Jingguang Bridge, and East Third Ring Road, and requesting cars to make a detour. Some car owners said that it was very good that the SMS was sent. Furthermore, there was one last sentence in that SMS thanking citizens for their cooperation and support. Ah, after the SMS was sent ,, I felt it seems to be the first time that Beijing Municipality, ah, used an SMS to give notification at the time of a public emergency. I don't know, all of us are living in Beijing, is this the first time, Professor Zhou? Yes, in terms of an official notification , this should be the first time one was sent officially through 1860. Uh-huh. -------------------------------------------------------------------------------- /tests/unit/text/data/ocr_1.txt: -------------------------------------------------------------------------------- 1 | The book Between you and me /. Hello / Good evening / Is /. I'd we to ask it Wallace if he's ever turned down an inter / Turned down a what / Interview /. Taned down an interview /? Were you ever asked by CBS to say go do this guy ? 
and your sant no /7 # so I do n't remember / No I don't think so /. Ontario Florida head / like I wanted to know / Go ahead / the first time I was in wes bork I sow a nice looking young man on TV in's show bike and Butty / was that you ? That was me he he /, That was me and Buff Coco who was my ). That's not But taking, Is it /f to no he he /. cause Butt is up in New Hampshire /. She lives in a home up there /. She's not well /. Um yeah she and I used to do a show on CaS when I first came to he's work / and it was a fascinating /. It was a lene be like Regis and on Kathy or on Recy's and Kelly /. But you were married , right /1 Yes /. What was it wise to do a show with the wife 7 Not easy /. Ha ha / I'm serious /. You know in I'd love to see that ! on we used to backer on live or I. and what happened was after a while the bickering corewed sher we got off the air / After you got off the # 1. You know what I mean /. I know / Decree helio / bill. Is /. How are you /7 Fire ). He Wallace this is a big pleasure for me to talk to you / But tan is what is your most officus aderview that you had it Sixty Minutes the most difficult person that you could have over interviewed /? I think probably the Alitois really because he was not anxious to do i / It was on just otter the US hostages had been taken in van /, and I was surprised that he was wasing to talk to us /. and it was a very very autocult business / we did it in the holy city of which of we / and the circumstances were offcult /, They took good care to see that we and n't got into trouble / Ha ha / we # take a break !, And he just / we " be back with more of mike wallace /. The book is Between lexi and his / the DVD is backand / on what can one say it's a terrific work / we'll be right back / That voice was the subject of The Raider /. That man I. that man remains my hero / jet Wygen who took on the tobacco cartel If you will /. And you remen. 
bar when all those guys who ran the companies raised their hands and said On It's not aceactive /. they knew & was abiktive / And he has succeeded /. I mean really he has succeeded /. He runs s founds tion for Smoke Free Kids / and he's gotten off kind of success in of kinds of ways in foreign countries and so forth /. The man is my have /. And your are mine /. And we have a miage and s hat let /. I know you 're asked this all the time / but how long you going to keep on keeping on /. How ling you /. you know what the dickens would I do /7 what waxat I do /7 How king are you going to keep doing what you 're / was but you 're / How old are you like /? fighty . seven /. can you imagine /? I'm going to be seventy . two /. so you're fifteen years older than me ). That's why I feel we e kil compared to you 1. 1. -------------------------------------------------------------------------------- /tests/unit/text/test_lcs.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from genalog.text.lcs import LCS 4 | 5 | 6 | @pytest.fixture( 7 | params=[ 8 | ("", ""), # empty 9 | ("abcde", "ace"), # naive case 10 | ] 11 | ) 12 | def lcs(request): 13 | str1, str2 = request.param 14 | return LCS(str1, str2) 15 | 16 | 17 | def test_lcs_init(lcs): 18 | assert lcs._lcs_len is not None 19 | assert lcs._lcs is not None 20 | 21 | 22 | @pytest.mark.parametrize( 23 | "str1, str2, expected_len, expected_lcs", 24 | [ 25 | ("", "", 0, ""), # empty 26 | ("abc", "abc", 3, "abc"), 27 | ("abcde", "ace", 3, "ace"), # naive case 28 | ("a", "", 0, ""), # no results 29 | ("abc", "cba", 1, "c"), # multiple cases 30 | ("abcdgh", "aedfhr", 3, "adh"), 31 | ("abc.!\t\nd", "dxab", 2, "ab"), # with punctuations 32 | ( 33 | "New York @", 34 | "New @ York", 35 | len("New York"), 36 | "New York", 37 | ), # with space-separated, tokens 38 | ("Is A Big City", "A Big City Is", len("A Big City"), "A Big City"), 39 | ("Is A Big City", "City Big Is A", len(" 
Big "), " Big "), # reversed order 40 | # mixed order with similar tokens 41 | ("Is A Big City IS", "IS Big A City Is", len("I Big City I"), "I Big City I"), 42 | # casing 43 | ( 44 | "Is A Big City IS a", 45 | "IS a Big City Is A", 46 | len("I Big City I "), 47 | "I Big City I ", 48 | ), 49 | ], 50 | ) 51 | def test_lcs_e2e(str1, str2, expected_len, expected_lcs): 52 | lcs = LCS(str1, str2) 53 | assert expected_lcs == lcs.get_str() 54 | assert expected_len == lcs.get_len() 55 | -------------------------------------------------------------------------------- /tests/unit/text/test_preprocess.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from genalog.text import preprocess 4 | from genalog.text.alignment import GAP_CHAR 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "token, replacement, desired_output", 9 | [ 10 | ("", "_", ""), # Do nothing to empty string 11 | (" ", "_", " "), # Do nothing to whitespaces 12 | (" \n\t", "_", " \n\t"), 13 | ("ascii", "_", "ascii"), 14 | ("a s\nc\tii", "_", "a s\nc\tii"), 15 | ("ascii·", "_", "ascii"), # Tokens with non-ASCII values 16 | ("·", "_", "_"), # Tokens with non-ASCII values 17 | ], 18 | ) 19 | def test_remove_non_ascii(token, replacement, desired_output): 20 | for code in range(128, 1000): # non-ASCII values 21 | token.replace("·", chr(code)) 22 | output = preprocess.remove_non_ascii(token, replacement) 23 | assert output == desired_output 24 | 25 | 26 | @pytest.mark.parametrize( 27 | "s, desired_output", 28 | [ 29 | (" New \t \n", ["New"]), 30 | # Mixed in gap char "@" 31 | (" @ @", ["@", "@"]), 32 | ("New York is big", ["New", "York", "is", "big"]), 33 | # Mixed multiple spaces and tabs 34 | (" New York \t is \t big", ["New", "York", "is", "big"]), 35 | # Mixed in punctuation 36 | ("New .York is, big !", ["New", ".York", "is,", "big", "!"]), 37 | # Mixed in gap char "@" 38 | ("@N@ew York@@@is,\t big@@@@@", ["@N@ew", "York@@@is,", "big@@@@@"]), 39 | ], 40 | ) 41 | 
def test_tokenize(s, desired_output): # tokenize splits on runs of whitespace; punctuation and "@" gap chars stay attached to their tokens 42 | output = preprocess.tokenize(s) 43 | assert output == desired_output 44 | 45 | 46 | @pytest.mark.parametrize( 47 | "tokens, desired_output", 48 | [ 49 | ( 50 | ["New", "York", "is", "big"], 51 | "New York is big", 52 | ), 53 | # Mixed in punctuation 54 | ( 55 | ["New", ".York", "is,", "big", "!"], 56 | "New .York is, big !", 57 | ), 58 | # Mixed in gap char "@" 59 | ( 60 | ["@N@ew", "York@@@is,", "big@@@@@"], 61 | "@N@ew York@@@is, big@@@@@", 62 | ), 63 | ], 64 | ) 65 | def test_join_tokens(tokens, desired_output): # join_tokens is the single-space inverse of tokenize 66 | output = preprocess.join_tokens(tokens) 67 | assert output == desired_output 68 | 69 | 70 | @pytest.mark.parametrize( 71 | "c, desired_output", 72 | [ 73 | # Gap char 74 | (GAP_CHAR, False), 75 | # Alphabet char 76 | ("a", False), 77 | ("A", False), 78 | # Punctuation 79 | (".", False), 80 | ("!", False), 81 | (",", False), 82 | ("-", False), 83 | # Token separators 84 | (" ", True), 85 | ("\n", True), 86 | ("\t", True), 87 | ], 88 | ) 89 | def test__is_spacing(c, desired_output): # only literal whitespace characters count as token separators 90 | assert desired_output == preprocess._is_spacing(c) 91 | 92 | 93 | @pytest.mark.parametrize( 94 | "text, desired_output", 95 | [ 96 | ("", ""), 97 | ("w .", "w ."), 98 | ("w !", "w !"), 99 | ("w ?", "w ?"), 100 | ("w /.", "w /."), 101 | ("w /!", "w /!"), 102 | ("w /?", "w /?"), 103 | ("w1 , w2 .", "w1 , w2 ."), 104 | ("w1 . w2 .", "w1 . \nw2 ."), 105 | ("w1 /. w2 /.", "w1 /. \nw2 /."), 106 | ("w1 ! w2 .", "w1 ! \nw2 ."), 107 | ("w1 /! w2 /.", "w1 /! \nw2 /."), 108 | ("w1 ? w2 .", "w1 ? \nw2 ."), 109 | ("w1 /? w2 /.", "w1 /? \nw2 /."), 110 | ("U.S. . w2 .", "U.S. . \nw2 ."), 111 | ("w1 ??? w2 .", "w1 ??? w2 ."), # not splitting 112 | ("w1 !!! w2 .", "w1 !!! w2 ."), 113 | ("w1 ... . w2 .", "w1 ... . \nw2 ."), 114 | ("w1 ... /. w2 /.", "w1 ... /. \nw2 /."), 115 | ("w1 /. /. w2 .", "w1 /. /. \nw2 ."), 116 | ("w1 /. /.", "w1 /. \n/."), 117 | ("w1 /. /. ", "w1 /. /. \n"), 118 | ("w1 ? ? ? ? w2 .", "w1 ? ? ? ? \nw2 ."), 119 | ("w1 /? 
/? /? /? w2 /.", "w1 /? /? /? /? \nw2 /."), 120 | ("w1 ! ! ! ! w2 .", "w1 ! ! ! ! \nw2 ."), 121 | ("w1 /! /! /! /! w2 /.", "w1 /! /! /! /! \nw2 /."), 122 | ], 123 | ) 124 | def test_split_sentences(text, desired_output): 125 | assert desired_output == preprocess.split_sentences(text) 126 | 127 | 128 | @pytest.mark.parametrize( 129 | "token, desired_output", 130 | [ 131 | ("", False), 132 | (" ", False), 133 | ("\n", False), 134 | ("\t", False), 135 | (" \n \t", False), 136 | ("...", False), 137 | ("???", False), 138 | ("!!!", False), 139 | (".", True), 140 | ("!", True), 141 | ("?", True), 142 | ("/.", True), 143 | ("/!", True), 144 | ("/?", True), 145 | ], 146 | ) 147 | def test_is_sentence_separator(token, desired_output): 148 | assert desired_output == preprocess.is_sentence_separator(token) 149 | -------------------------------------------------------------------------------- /tests/unit/text/test_utf8.py: -------------------------------------------------------------------------------- 1 | import random 2 | import warnings 3 | 4 | import pytest 5 | 6 | from genalog.text import alignment 7 | from genalog.text.alignment import GAP_CHAR 8 | from tests.unit.cases.text_alignment import ALIGNMENT_REGRESSION_TEST_CASES 9 | 10 | 11 | def random_utf8_char(byte_len=1): 12 | if byte_len == 1: 13 | return chr(random.randint(0, 0x007F)) 14 | elif byte_len == 2: 15 | return chr(random.randint(0x007F, 0x07FF)) 16 | elif byte_len == 3: 17 | return chr(random.randint(0x07FF, 0xFFFF)) 18 | elif byte_len == 4: 19 | return chr(random.randint(0xFFFF, 0x10FFFF)) 20 | else: 21 | raise ValueError( 22 | f"Invalid byte length: {byte_len}." 
23 | + "utf-8 does not encode characters with more than 4 bytes in length" 24 | ) 25 | 26 | 27 | @pytest.mark.parametrize( 28 | "num_utf_char_to_test", [100] 29 | ) # Number of char per byte length 30 | @pytest.mark.parametrize( 31 | "byte_len", [1, 2, 3, 4] 32 | ) # UTF does not encode with more than 4 bytes 33 | @pytest.mark.parametrize( 34 | "gt_txt, noisy_txt, expected_aligned_gt, expected_aligned_noise", 35 | ALIGNMENT_REGRESSION_TEST_CASES, 36 | ) 37 | def test_align( 38 | num_utf_char_to_test, 39 | byte_len, 40 | gt_txt, 41 | noisy_txt, 42 | expected_aligned_gt, 43 | expected_aligned_noise, 44 | ): 45 | 46 | invalid_char = set(gt_txt).union( 47 | set(GAP_CHAR) 48 | ) # character to replace to cannot be in this set 49 | for _ in range(num_utf_char_to_test): 50 | utf_char = random_utf8_char(byte_len) 51 | while ( 52 | utf_char in invalid_char 53 | ): # find a utf char not in the input string and not GAP_CHAR 54 | utf_char = random_utf8_char(byte_len) 55 | char_to_replace = random.choice(list(invalid_char)) if gt_txt else "" 56 | 57 | gt_txt.replace(char_to_replace, utf_char) 58 | noisy_txt.replace(char_to_replace, utf_char) 59 | expected_aligned_gt_sub = expected_aligned_gt.replace(char_to_replace, utf_char) 60 | expected_aligned_noise_sub = expected_aligned_noise.replace( 61 | char_to_replace, utf_char 62 | ) 63 | 64 | # Run alignment 65 | aligned_gt, aligned_noise = alignment.align(gt_txt, noisy_txt) 66 | 67 | aligned_gt = aligned_gt.replace(char_to_replace, utf_char) 68 | aligned_noise = aligned_noise.replace(char_to_replace, utf_char) 69 | if aligned_gt != expected_aligned_gt_sub: 70 | expected_alignment = alignment._format_alignment( 71 | expected_aligned_gt_sub, expected_aligned_noise_sub 72 | ) 73 | result_alignment = alignment._format_alignment(aligned_gt, aligned_noise) 74 | warnings.warn( 75 | RuntimeWarning( 76 | f"\n\n****Expect alignment returns:****\n{expected_alignment} \n****But got:****\n{result_alignment}" 77 | ) 78 | ) 79 | 
-------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = flake8, py 3 | 4 | 5 | [testenv] 6 | passenv = 7 | # For e2e testing the OCR components 8 | BLOB_KEY 9 | BLOB_NAME 10 | COGNITIVE_SERVICE_KEY 11 | COMPUTER_VISION_SUBSCRIPTION_KEY 12 | SEARCH_SERVICE_KEY 13 | # Reading additional dependencies to run the test 14 | # https://tox.readthedocs.io/en/latest/example/basic.html#depending-on-requirements-txt-or-defining-constraints 15 | deps = -rrequirements-dev.txt 16 | commands = 17 | # {posargs} will be substituted by arguments after the `--` when running. 18 | # This will allow running a subset of the test suite via tox. 19 | # 20 | # EX: tox -- -m "not azure and not slow" 21 | # will pass {-m "not azure and not slow"} to `pytest` 22 | # See https://tox.readthedocs.io/en/latest/example/general.html for more details 23 | pytest {posargs} 24 | 25 | 26 | [testenv:flake8] 27 | deps = flake8 28 | skip_install = True 29 | commands = flake8 . 30 | 31 | 32 | # Configurations for running pytest 33 | [pytest] 34 | log_cli = False 35 | log_format = %(asctime)s %(levelname)s %(message)s 36 | junit_family = xunit2 37 | # This enables custom markers as decorator "@pytest.mark.slow" 38 | markers = 39 | # These two markers allow us to run a faster subset of the tests: 40 | # EX: pytest -m "not slow and not azure" 41 | # See https://docs.pytest.org/en/stable/example/markers.html#registering-markers 42 | slow: marks tests as slow (deselect with '-m "not slow"') 43 | azure: marks as integration tests that require azure resource 44 | io: marks integration tests involving some form of I/O operations (disk, internet, etc) 45 | testpaths = 46 | tests 47 | addopts = 48 | # reports all (except passed tests). 
See https://docs.pytest.org/en/latest/usage.html#detailed-summary-report 49 | -ra 50 | --cov-append --cov=genalog --cov-report=html --cov-report=term-missing --cov-report=xml --junitxml=junit/test-results.xml 51 | 52 | 53 | [flake8] 54 | # Configs for flake8-import-order, see https://pypi.org/project/flake8-import-order/ for more info. 55 | import-order-style=edited 56 | application-import-names=genalog, tests 57 | # Native flake8 configs 58 | max-line-length = 140 59 | exclude = 60 | build, dist, docs, example, 61 | .env*,.venv* # local virtual environments 62 | .tox 63 | --------------------------------------------------------------------------------