├── .gitignore ├── CHANGELOG.md ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── RELEASE.md ├── SECURITY.md ├── VERSION.txt ├── devops ├── nightly.yml ├── pr-gate.yml ├── release.yml └── templates │ ├── base │ ├── publish-test-results.yml │ ├── run-linter.yml │ └── run-tests.yml │ ├── build_wheel_n_sdist.yml │ ├── install-dependencies.yml │ ├── merge-cov-reports.yml │ └── run-tests-on-multiple-os-py.yml ├── docs ├── .gitignore ├── genalog_docs │ ├── _config.yml │ ├── _toc.yml │ ├── doc_degradation.md │ ├── doc_generation.md │ ├── docstring │ │ ├── genalog.degradation.rst │ │ ├── genalog.generation.rst │ │ ├── genalog.ocr.rst │ │ └── genalog.text.rst │ ├── e2e_dataset_pipeline.md │ ├── generation_pipeline.ipynb │ ├── index.md │ ├── installation.md │ ├── ocr_label_propagation.ipynb │ ├── static │ │ ├── analog_doc_gen_pipeline.png │ │ ├── bleed_through.png │ │ ├── blur.png │ │ ├── close_dilate.png │ │ ├── columns_Times_11px.png │ │ ├── degrader.png │ │ ├── degrader_heavy.png │ │ ├── genalog_demo.gif │ │ ├── genalog_favicon.svg │ │ ├── genalog_full_logo.svg │ │ ├── genalog_logo_no_text.svg │ │ ├── kernel_morph.png │ │ ├── labeled_synthetic_pipeline.png │ │ ├── letter_Times_11px.png │ │ ├── open_erode.png │ │ ├── salt_pepper.png │ │ └── text_block_Times_11px.png │ └── text_alignment.ipynb └── requirements-doc.txt ├── example ├── dataset_generation.ipynb ├── demo_generate.py ├── document_degradation.ipynb ├── document_generation.ipynb ├── generation_pipeline.ipynb ├── ocr_extraction.ipynb ├── ocr_label_propagation.ipynb ├── sample │ ├── degradation │ │ ├── bleed_through.png │ │ ├── blur.png │ │ ├── close_dilate.png │ │ ├── degrader.png │ │ ├── degrader_heavy.png │ │ ├── kernel_morph.png │ │ ├── open_erode.png │ │ ├── salt_pepper.png │ │ ├── text_block.png │ │ └── text_zoomed.png │ └── generation │ │ ├── columns_Times_11px.pdf │ │ ├── columns_Times_11px.png │ │ ├── example.txt │ │ ├── letter_Times_11px.pdf │ │ ├── letter_Times_11px.png │ 
│ ├── text_block_Times_11px.pdf │ │ ├── text_block_Times_11px.png │ │ ├── text_block_Times_11px_pg_0.png │ │ └── text_block_Times_11px_pg_1.png ├── static │ ├── analog_doc_gen_pipeline.png │ ├── genalog_components.png │ └── labeled_synthetic_pipeline.png └── text_alignment.ipynb ├── genalog ├── README.md ├── __init__.py ├── degradation │ ├── README.md │ ├── __init__.py │ ├── degrader.py │ └── effect.py ├── generation │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── content.py │ ├── document.py │ └── templates │ │ ├── base.css.jinja │ │ ├── base.html.jinja │ │ ├── columns.css.jinja │ │ ├── columns.html.jinja │ │ ├── letter.css.jinja │ │ ├── letter.html.jinja │ │ ├── macro │ │ ├── dimension.css.jinja │ │ ├── page_layout.css.jinja │ │ └── text.css.jinja │ │ ├── text_block.css.jinja │ │ └── text_block.html.jinja ├── ocr │ ├── README.md │ ├── __init__.py │ ├── blob_client.py │ ├── common.py │ ├── grok.py │ ├── metrics.py │ ├── rest_client.py │ └── templates │ │ ├── datasource.json │ │ ├── index.json │ │ ├── indexer.json │ │ ├── knowledge_store.json │ │ └── skillset.json ├── pipeline.py └── text │ ├── README.md │ ├── __init__.py │ ├── alignment.py │ ├── anchor.py │ ├── conll_format.py │ ├── lcs.py │ ├── ner_label.py │ ├── preprocess.py │ └── splitter.py ├── requirements-dev.txt ├── requirements.txt ├── setup.py ├── tests ├── .env ├── __init__.py ├── conftest.py ├── e2e │ ├── data │ │ ├── conll_formatter │ │ │ ├── clean_labels │ │ │ │ ├── 0.txt │ │ │ │ ├── 1.txt │ │ │ │ ├── 11618.txt │ │ │ │ ├── 11656.txt │ │ │ │ ├── 16.txt │ │ │ │ ├── 17.txt │ │ │ │ ├── 1838.txt │ │ │ │ ├── 1839.txt │ │ │ │ ├── 1901.txt │ │ │ │ ├── 2.txt │ │ │ │ ├── 2161.txt │ │ │ │ ├── 3.txt │ │ │ │ ├── 4.txt │ │ │ │ ├── 482.txt │ │ │ │ ├── 5.txt │ │ │ │ ├── 6.txt │ │ │ │ ├── 7.txt │ │ │ │ ├── 7965.txt │ │ │ │ ├── 8.txt │ │ │ │ └── 9.txt │ │ │ └── ocr_text │ │ │ │ ├── 0.txt │ │ │ │ ├── 1.txt │ │ │ │ ├── 11618.txt │ │ │ │ ├── 11656.txt │ │ │ │ ├── 16.txt │ │ │ │ ├── 17.txt │ │ │ │ ├── 1838.txt │ 
│ │ │ ├── 1839.txt │ │ │ │ ├── 1901.txt │ │ │ │ ├── 2.txt │ │ │ │ ├── 2161.txt │ │ │ │ ├── 3.txt │ │ │ │ ├── 4.txt │ │ │ │ ├── 5.txt │ │ │ │ ├── 6.txt │ │ │ │ ├── 7.txt │ │ │ │ ├── 7965.txt │ │ │ │ ├── 8.txt │ │ │ │ └── 9.txt │ │ ├── splitter │ │ │ ├── example_conll2012.txt │ │ │ └── example_splits │ │ │ │ ├── clean_labels │ │ │ │ ├── 0.txt │ │ │ │ └── 1.txt │ │ │ │ └── clean_text │ │ │ │ ├── 0.txt │ │ │ │ └── 1.txt │ │ └── synthetic_dataset │ │ │ ├── shared │ │ │ ├── test │ │ │ │ └── clean_labels │ │ │ │ │ └── 1901.txt │ │ │ └── train │ │ │ │ └── clean_labels │ │ │ │ └── 2161.txt │ │ │ └── test_version │ │ │ ├── .gitignore │ │ │ ├── test │ │ │ └── ocr │ │ │ │ └── 1901.json │ │ │ └── train │ │ │ └── ocr │ │ │ └── 2161.json │ ├── templates │ │ └── solid_bg.html.jinja │ ├── test_anchor_e2e.py │ ├── test_conll_format_e2e.py │ ├── test_document_generation.py │ ├── test_generaton_n_degradation.py │ ├── test_image_channel.py │ ├── test_ocr_e2e.py │ ├── test_pipeline.py │ └── test_splitter.py ├── required_env.py └── unit │ ├── __init__.py │ ├── cases │ ├── __init__.py │ ├── label_propagation.py │ └── text_alignment.py │ ├── degradation │ ├── __init__.py │ ├── test_degrader.py │ └── test_effect.py │ ├── generation │ ├── 2x2.jpg │ ├── __init__.py │ ├── templates │ │ ├── font_family.html.jinja │ │ ├── mock.html.jinja │ │ └── multipage.html.jinja │ ├── test_content.py │ └── test_document.py │ ├── ocr │ ├── __init__.py │ ├── data │ │ ├── img │ │ │ ├── 0.png │ │ │ ├── 1.png │ │ │ └── 11.png │ │ ├── json │ │ │ ├── 521c38122f783673598856cd81d91c21_0.json │ │ │ ├── 521c38122f783673598856cd81d91c21_1.json │ │ │ └── 521c38122f783673598856cd81d91c21_11.json │ │ ├── metrics.csv │ │ ├── metrics │ │ │ ├── json │ │ │ │ ├── 123_001.json │ │ │ │ ├── 123_002.json │ │ │ │ └── 123_003.json │ │ │ ├── metrics.csv │ │ │ ├── substitution.pkl │ │ │ └── text │ │ │ │ ├── 001.txt │ │ │ │ ├── 002.txt │ │ │ │ └── 003.txt │ │ ├── substitution.json │ │ ├── substitution.pkl │ │ └── text │ │ │ ├── 0.txt │ 
│ │ ├── 1.txt │ │ │ └── 11.txt │ ├── test_metrics.py │ └── test_ocr.py │ └── text │ ├── data │ ├── gt_1.txt │ ├── gt_2.txt │ ├── gt_3.txt │ ├── label_generator │ │ ├── labels │ │ │ ├── 0.tsv │ │ │ ├── 1.tsv │ │ │ └── 11.tsv │ │ └── text │ │ │ ├── 0.txt │ │ │ ├── 1.txt │ │ │ └── 11.txt │ ├── ocr_1.txt │ ├── ocr_2.txt │ └── ocr_3.txt │ ├── test_alignment.py │ ├── test_anchor.py │ ├── test_conll_format.py │ ├── test_lcs.py │ ├── test_ner_label.py │ ├── test_preprocess.py │ └── test_utf8.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | # Test output 2 | test_out 3 | 4 | # Secrets, keys and other credentials 5 | .secret* 6 | .cred* 7 | 8 | # Environments 9 | .env* 10 | .venv* 11 | **/.env/ 12 | env/ 13 | venv/ 14 | ENV/ 15 | env.bak/ 16 | venv.bak/ 17 | 18 | # Credentials 19 | .secrets 20 | .secret* 21 | 22 | # IDE 23 | .vscode 24 | 25 | # Byte-compiled / optimized / DLL files 26 | __pycache__/ 27 | *.py[cod] 28 | *$py.class 29 | 30 | # C extensions 31 | *.so 32 | 33 | # Distribution / packaging 34 | .Python 35 | build/ 36 | develop-eggs/ 37 | dist/ 38 | downloads/ 39 | eggs/ 40 | .eggs/ 41 | lib/ 42 | lib64/ 43 | parts/ 44 | sdist/ 45 | var/ 46 | wheels/ 47 | pip-wheel-metadata/ 48 | share/python-wheels/ 49 | *.egg-info/ 50 | .installed.cfg 51 | *.egg 52 | MANIFEST 53 | 54 | # PyInstaller 55 | # Usually these files are written by a python script from a template 56 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
57 | *.manifest 58 | *.spec 59 | 60 | # Installer logs 61 | pip-log.txt 62 | pip-delete-this-directory.txt 63 | 64 | # Unit test / coverage reports 65 | htmlcov/ 66 | .tox/ 67 | .nox/ 68 | .coverage 69 | .coverage.* 70 | .cache 71 | nosetests.xml 72 | coverage.xml 73 | *.cover 74 | *.py,cover 75 | .hypothesis/ 76 | .pytest_cache/ 77 | junit 78 | 79 | # Translations 80 | *.mo 81 | *.pot 82 | 83 | # Django stuff: 84 | *.log 85 | local_settings.py 86 | db.sqlite3 87 | db.sqlite3-journal 88 | 89 | # Flask stuff: 90 | instance/ 91 | .webassets-cache 92 | 93 | # Scrapy stuff: 94 | .scrapy 95 | 96 | # Sphinx documentation 97 | docs/_build/ 98 | 99 | # PyBuilder 100 | target/ 101 | 102 | # Jupyter Notebook 103 | .ipynb_checkpoints 104 | 105 | # IPython 106 | profile_default/ 107 | ipython_config.py 108 | 109 | # pyenv 110 | .python-version 111 | 112 | # pipenv 113 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 114 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 115 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 116 | # install all needed dependencies. 117 | #Pipfile.lock 118 | 119 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 120 | __pypackages__/ 121 | 122 | # Celery stuff 123 | celerybeat-schedule 124 | celerybeat.pid 125 | 126 | # SageMath parsed files 127 | *.sage.py 128 | 129 | # Environments 130 | .env 131 | .venv 132 | env/ 133 | venv/ 134 | ENV/ 135 | env.bak/ 136 | venv.bak/ 137 | 138 | # Spyder project settings 139 | .spyderproject 140 | .spyproject 141 | 142 | # Rope project settings 143 | .ropeproject 144 | 145 | # mkdocs documentation 146 | /site 147 | 148 | # mypy 149 | .mypy_cache/ 150 | .dmypy.json 151 | dmypy.json 152 | 153 | # Pyre type checker 154 | .pyre/ 155 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Genalog Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | Types of changes 5 | 1. `Added` for new features. 6 | 1. `Changed` for changes in existing functionality. 7 | 1. `Deprecated` for soon-to-be removed features. 8 | 1. `Removed` for now removed features. 9 | 1. `Fixed` for any bug fixes. 10 | 1. `Security` in case of vulnerabilities. 11 | 12 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 13 | and we adopt the [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 14 | 15 | ## [v0.1.0] - 2021-07-19 16 | ### Added 17 | - Initial package release: 18 | - 3 standard HTML document template for generation 19 | - basic image degradation effects including blur, bleed-through, salt & pepper and other morphological operations. 20 | - 2 flavors of text alignment algorithm: Needleman-Wunsch (shorter text segments) and RETAS (longer text segments) 21 | - Full e2e NER-OCR label generation notebooks 22 | - See [documentation](https://microsoft.github.io/genalog/installation.html) for more on the initial features of the package. 
 23 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | 2 | # Each line is a file pattern followed by one or more owners. 3 | 4 | # These owners will be the default owners for everything in 5 | # the repo. Unless a later match takes precedence, 6 | # @global-owner1 and @global-owner2 will be requested for review when someone opens a pull request. 7 | * @microsoft/genalog-admins 8 | 9 | genalog/degradation/ @laserprec 10 | genalog/generation/ @laserprec 11 | genalog/text/ @laserprec 12 | genalog/ocr/ @laserprec 13 | 14 | tests/ @laserprec -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Microsoft Corporation. 
2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.md 2 | include *.txt 3 | include LICENSE CODEOWNERS 4 | include .gitignore tox.ini MANIFEST.in 5 | recursive-include genalog *.py *.jinja 6 | recursive-include tests *.py *.jinja *.jpg -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | # Genalog Release Procedure 2 | 3 | Checklist for the release process of `genalog`: 4 | 5 | ### Preparation 6 | - [ ] Ensure `main` branch contains all relevant changes and PRs relating to the specific release is merged 7 | - [ ] Create and switch to a new release branch (i.e. 
release-X.Y.Z) 8 | 9 | ### Package Metadata Update 10 | - [ ] Update VERSION.txt with version bump. Please reference [Semantic Versioning](https://semver.org/). 11 | - [ ] Update [CHANGELOG.md](./CHANGELOG.md) 12 | - [ ] Commit the above changes with title "Release vX.Y.Z" 13 | - [ ] Generate a new git tag for the new version (e.g. `git tag -a v0.1.0 -m "Initial Release"`) 14 | - [ ] Push the new tag to remote `git push origin v0.1.0` 15 | - [ ] Create a new PR with the above changes into `main` branch. 16 | 17 | ### Run the Full Test Suites 18 | - [ ] If you haven't, `pip install tox` 19 | - [ ] Run the test suites with `tox -e py -- -m "not azure"` (we will skip the azure related tests as they will be deprecated) 20 | 21 | ### Release to PyPI 22 | - [ ] Manually trigger the [release pipeline](https://dev.azure.com/genalog-dev/genalog/_build?definitionId=2) in DevOps on the release branch, this will publish latest version of `genalog` to PyPI. 23 | - [ ] Select `releaseType` to `Test` to test out the release in [TestPyPI](https://test.pypi.org/project/genalog/) 24 | - [ ] Rerun and switch `releaseType` to production if looks good. 
25 | - [ ] If the pipeline ran successfully, check and publish the draft of this release on [Github Release](https://github.com/microsoft/genalog/releases) 26 | - [ ] Latest version is pip-installable with: 27 | - `pip install genalog` 28 | 29 | ### Update Documentation on Github Page 30 | - [ ] Staying on the release branch, `cd docs && pip install -r requirements-doc.txt` 31 | - [ ] Build the jupyter-book with `jupyter-book build --all genalog_docs` 32 | - [ ] Preview the HTML files, if looks good [publish to Github Page](https://jupyterbook.org/start/publish.html#publish-your-book-online-with-github-pages): `ghp-import -n -p -f genalog_docs/_build/html` 33 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 
14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
40 | 41 | -------------------------------------------------------------------------------- /VERSION.txt: -------------------------------------------------------------------------------- 1 | 0.1.0 -------------------------------------------------------------------------------- /devops/nightly.yml: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | name: $(Date:yyyyMMdd).$(Rev:r) 7 | 8 | trigger: none # nightly build is scheduled once per day 9 | 10 | pr: none 11 | 12 | variables: 13 | - group: azureResourceKeys 14 | 15 | stages: 16 | - stage: static_analysis 17 | jobs: 18 | - job: flake8_linux_py36 19 | pool: 20 | vmImage: 'ubuntu-latest' 21 | steps: 22 | - template: templates/base/run-linter.yml 23 | parameters: 24 | pyVersion: '3.6' 25 | - task: ComponentGovernanceComponentDetection@0 26 | 27 | - stage: unit_tests 28 | dependsOn: static_analysis 29 | jobs: 30 | - template: templates/run-tests-on-multiple-os-py.yml 31 | parameters: 32 | pyVersions: ['3.6', '3.7', '3.8'] 33 | testTypes: ['unit', 'io'] 34 | imageOSs: ['ubuntu-18.04'] # 'windows-latest', 'macos-latest' are not supported 35 | 36 | - stage: e2e_tests 37 | dependsOn: static_analysis 38 | jobs: 39 | - template: templates/run-tests-on-multiple-os-py.yml 40 | parameters: 41 | pyVersions: ['3.6', '3.7', '3.8'] 42 | testTypes: ['e2e'] 43 | imageOSs: ['ubuntu-18.04'] # 'windows-latest', 'macos-latest' are not supported 44 | 45 | - stage: collect_final_code_coverage 46 | dependsOn: 47 | - unit_tests 48 | - e2e_tests 49 | jobs: 50 | - template: templates/merge-cov-reports.yml 51 | 52 | - stage: publish_artifacts 53 | jobs: 54 | - job: archive_wheel_and_sdist 55 | pool: 56 | vmImage: 'ubuntu-latest' 57 | steps: 58 | - template: 
templates/build_wheel_n_sdist.yml 59 | 60 | - task: PublishBuildArtifacts@1 61 | inputs: 62 | PathtoPublish: $(Build.SourcesDirectory)/dist 63 | ArtifactName: distribution_artifacts 64 | publishLocation: 'Container' 65 | displayName: 'Publish wheel and sdist' 66 | -------------------------------------------------------------------------------- /devops/pr-gate.yml: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | name: $(Date:yyyyMMdd).$(Rev:r) 7 | 8 | trigger: none # trigger only via pr 9 | 10 | pr: 11 | - main 12 | 13 | variables: 14 | - group: azureResourceKeys 15 | 16 | stages: 17 | - stage: static_analysis 18 | jobs: 19 | - job: flake8_linux_py36 20 | pool: 21 | vmImage: 'ubuntu-latest' 22 | steps: 23 | - template: templates/base/run-linter.yml 24 | parameters: 25 | pyVersion: '3.6' 26 | - task: ComponentGovernanceComponentDetection@0 27 | 28 | - stage: unit_tests 29 | dependsOn: static_analysis 30 | jobs: 31 | - template: templates/run-tests-on-multiple-os-py.yml 32 | parameters: 33 | pyVersions: ['3.6', '3.7', '3.8'] 34 | testTypes: ['unit', 'io'] 35 | imageOSs: ['ubuntu-18.04'] # 'windows-latest', 'macos-latest' are not supported 36 | 37 | - stage: e2e_tests 38 | dependsOn: static_analysis 39 | jobs: 40 | - template: templates/run-tests-on-multiple-os-py.yml 41 | parameters: 42 | pyVersions: ['3.6'] 43 | testTypes: ['e2e'] 44 | imageOSs: ['ubuntu-18.04'] # 'windows-latest', 'macos-latest' are not supported 45 | 46 | - stage: collect_final_code_coverage 47 | dependsOn: 48 | - unit_tests 49 | - e2e_tests 50 | jobs: 51 | - template: templates/merge-cov-reports.yml 52 | -------------------------------------------------------------------------------- /devops/release.yml: 
-------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | trigger: none # Need manual trigger 7 | 8 | parameters: 9 | - name: releaseType 10 | displayName: Release Type 11 | type: string 12 | default: Test 13 | values: 14 | - Test 15 | - Production 16 | 17 | strategy: 18 | matrix: 19 | linux_x64_py3.6: 20 | imageName: 'ubuntu-18.04' 21 | python.version: '3.6' 22 | 23 | pool: 24 | vmImage: '$(imageName)' 25 | 26 | steps: 27 | - task: UsePythonVersion@0 28 | inputs: 29 | versionSpec: '$(python.version)' 30 | addToPath: true 31 | architecture: 'x64' 32 | displayName: 'Use Python $(python.version)' 33 | 34 | - bash: | 35 | pip install --upgrade pip 36 | pip install setuptools wheel 37 | python setup.py bdist_wheel --dist-dir dist 38 | python setup.py sdist --dist-dir dist 39 | workingDirectory: $(Build.SourcesDirectory) 40 | displayName: 'Building wheel package & sdist' 41 | 42 | - task: GitHubRelease@1 43 | inputs: 44 | gitHubConnection: 'github.com_laserprec' 45 | repositoryName: 'microsoft/genalog' 46 | action: 'create' 47 | target: '$(Build.SourceVersion)' 48 | tagSource: 'gitTag' 49 | tagPattern: 'v.*' 50 | releaseNotesFilePath: 'CHANGELOG.md' 51 | assets: '$(Build.SourcesDirectory)/dist/*' 52 | isDraft: true 53 | changeLogCompareToRelease: 'lastFullRelease' 54 | changeLogType: 'commitBased' 55 | condition: ${{eq(parameters.releaseType, 'Test')}} 56 | displayName: 'Prepare GitHub Release (Draft)' 57 | 58 | - bash: | 59 | pip install twine 60 | workingDirectory: $(Build.SourcesDirectory) 61 | displayName: 'Install twine' 62 | 63 | - task: TwineAuthenticate@1 64 | inputs: 65 | pythonUploadServiceConnection: testpypi 66 | condition: ${{eq(parameters.releaseType, 'Test')}} 67 | displayName: 'Twine Authentication 
for Test' 68 | 69 | - task: TwineAuthenticate@1 70 | inputs: 71 | pythonUploadServiceConnection: pypi 72 | condition: ${{eq(parameters.releaseType, 'Production')}} 73 | displayName: 'Twine Authentication for Production' 74 | 75 | - bash: | 76 | twine upload --verbose -r genalog --config-file $(PYPIRC_PATH) dist/*.whl 77 | workingDirectory: $(Build.SourcesDirectory) 78 | displayName: 'Uploading Wheel to ${{parameters.releaseType}} PyPI' -------------------------------------------------------------------------------- /devops/templates/base/publish-test-results.yml: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | # Template for publishing test result report 7 | parameters: 8 | - name: pyVersion 9 | type: string 10 | 11 | steps: 12 | - task: PublishTestResults@2 13 | inputs: 14 | testResultsFormat: 'JUnit' 15 | testResultsFiles: 'junit/*.xml' 16 | searchFolder: $(Build.SourcesDirectory) 17 | testRunTitle: $(Agent.OS) py$(pyVersion) Build 18 | buildPlatform: $(Agent.OS) 19 | condition: always() # Always publish test results 20 | displayName: 'Publish test report' -------------------------------------------------------------------------------- /devops/templates/base/run-linter.yml: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # --------------------------------------------------------- 5 | 6 | # Template for running linter and other static analysis tools on the code 7 | parameters: 8 | - name: pyVersion 9 | type: string 10 | default: '3.6' 11 | 12 | steps: 13 | - task: UsePythonVersion@0 14 | inputs: 15 | versionSpec: ${{ parameters.pyVersion }} 16 | addToPath: true 17 | architecture: 'x64' 18 | displayName: 'Use Python ${{ parameters.pyVersion }}' 19 | 20 | - bash: | 21 | python -m pip install --upgrade pip setuptools wheel 22 | python -m pip install -r requirements-dev.txt 23 | workingDirectory: $(Build.SourcesDirectory) 24 | displayName: 'Install flake8 and other dev dependencies' 25 | 26 | - bash: | 27 | tox -e flake8 28 | workingDirectory: $(Build.SourcesDirectory) 29 | displayName: 'Run Linter (flake8)' -------------------------------------------------------------------------------- /devops/templates/base/run-tests.yml: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # --------------------------------------------------------- 5 | 6 | # Template for running tests on multiple Python versions and platforms 7 | parameters: 8 | - name: testType 9 | type: string 10 | default: all 11 | values: 12 | - unit 13 | - e2e 14 | - slow 15 | - azure 16 | - io 17 | - all 18 | 19 | steps: 20 | - bash: | 21 | if [[ '${{parameters.testType}}' == 'all' ]] 22 | then 23 | tox -e py 24 | elif [[ '${{parameters.testType}}' == 'unit' ]] 25 | then 26 | tox -e py -- tests/unit 27 | elif [[ '${{parameters.testType}}' == 'e2e' ]] 28 | then 29 | tox -e py -- tests/e2e 30 | else 31 | tox -e py -- -m "${{parameters.testType}}" 32 | fi 33 | env: 34 | # These keys come from azureResourceKeys variable group 35 | BLOB_KEY : $(BLOB_KEY) 36 | SEARCH_SERVICE_KEY: $(SEARCH_SERVICE_KEY) 37 | COGNITIVE_SERVICE_KEY: $(COGNITIVE_SERVICE_KEY) 38 | COMPUTER_VISION_SUBSCRIPTION_KEY: $(COMPUTER_VISION_SUBSCRIPTION_KEY) 39 | workingDirectory: $(Build.SourcesDirectory) 40 | displayName: 'Running (${{parameters.testType}}) Tests' 41 | -------------------------------------------------------------------------------- /devops/templates/build_wheel_n_sdist.yml: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # --------------------------------------------------------- 5 | 6 | # Template to create wheel and source distribution 7 | parameters: 8 | - name: pyVersion 9 | default: '3.6' 10 | 11 | steps: 12 | - task: UsePythonVersion@0 13 | inputs: 14 | versionSpec: ${{ parameters.pyVersion }} 15 | addToPath: true 16 | architecture: 'x64' 17 | displayName: 'Use Python ${{ parameters.pyVersion }}' 18 | 19 | - bash: | 20 | python -m pip install --upgrade pip setuptools wheel 21 | displayName: 'Update pip and setuptools' 22 | 23 | - bash: | 24 | python setup.py bdist_wheel 25 | workingDirectory: $(Build.SourcesDirectory) 26 | displayName: 'Build wheel' 27 | 28 | - bash: | 29 | python setup.py sdist 30 | workingDirectory: $(Build.SourcesDirectory) 31 | displayName: 'Build source distribution' 32 | 33 | - bash: | 34 | ls dist 35 | workingDirectory: $(Build.SourcesDirectory) 36 | displayName: 'Show artifacts in folder' 37 | -------------------------------------------------------------------------------- /devops/templates/install-dependencies.yml: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | # Assume a python version is enabled with "UsePythonVersion@0" task 7 | steps: 8 | - bash: | 9 | python -m pip install --upgrade pip setuptools wheel 10 | python -m pip install -r requirements-dev.txt 11 | workingDirectory: $(Build.SourcesDirectory) 12 | displayName: 'Install dependencies' 13 | -------------------------------------------------------------------------------- /devops/templates/merge-cov-reports.yml: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 
 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | # Template to merge several code coverage reports (.coverage*) 7 | parameters: 8 | - name: pyVersion 9 | default: '3.6' 10 | 11 | jobs: 12 | - job: 13 | displayName: Merge cov reports 14 | pool: 15 | vmImage: 'ubuntu-latest' 16 | 17 | steps: 18 | - task: UsePythonVersion@0 19 | inputs: 20 | versionSpec: ${{ parameters.pyVersion }} 21 | addToPath: true 22 | architecture: 'x64' 23 | displayName: 'Use Python ${{ parameters.pyVersion }}' 24 | 25 | - bash: | 26 | python -m pip install --upgrade pip setuptools 27 | python -m pip install coverage 28 | workingDirectory: $(Build.SourcesDirectory) 29 | displayName: 'Install coverage' 30 | 31 | # See https://docs.microsoft.com/en-us/azure/devops/pipelines/artifacts/pipeline-artifacts?view=azure-devops&tabs=yaml#multiple-artifacts 32 | - download: current 33 | patterns: '**/.coverage*' 34 | 35 | - bash: | 36 | python -m coverage combine $(Pipeline.Workspace)/**/.coverage* 37 | python -m coverage report 38 | python -m coverage xml 39 | workingDirectory: $(Build.SourcesDirectory) 40 | displayName: Show and merge cached coverage report 41 | 42 | - task: PublishCodeCoverageResults@1 43 | inputs: 44 | codeCoverageTool: Cobertura 45 | summaryFileLocation: '$(Build.SourcesDirectory)/coverage.xml' 46 | displayName: 'Publish merged code coverage report' 47 | 48 | 49 | -------------------------------------------------------------------------------- /devops/templates/run-tests-on-multiple-os-py.yml: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # --------------------------------------------------------- 5 | 6 | parameters: 7 | - name: pyVersions 8 | type: object 9 | default: ['3.6', '3.7', '3.8'] 10 | - name: testTypes 11 | type: object 12 | default: ['fast', 'slow'] 13 | - name: imageOSs 14 | type: object 15 | default: ['ubuntu-latest'] 16 | 17 | jobs: 18 | - ${{ each imageOS in parameters.imageOSs }}: 19 | - ${{ each pyVersion in parameters.pyVersions }}: 20 | - job: 21 | displayName: ${{imageOS}} py${{pyVersion}} 22 | pool: 23 | vmImage: ${{imageOS}} 24 | steps: 25 | 26 | - task: UsePythonVersion@0 27 | inputs: 28 | versionSpec: ${{pyVersion}} 29 | addToPath: true 30 | architecture: 'x64' 31 | displayName: 'Use Python ${{pyVersion}}' 32 | 33 | - template: install-dependencies.yml 34 | 35 | - ${{ each testType in parameters.testTypes }}: 36 | - template: base/run-tests.yml 37 | parameters: 38 | testType: ${{testType}} 39 | - template: base/publish-test-results.yml 40 | parameters: 41 | pyVersion: ${{pyVersion}} 42 | 43 | - bash: | 44 | mv .coverage .coverage_$(System.StageName)_${{imageOS}}_${{pyVersion}} 45 | ls .coverage* 46 | workingDirectory: $(Build.SourcesDirectory) 47 | displayName: 'Rename coverage report' 48 | # Cache the coverage report 49 | - publish: $(Build.SourcesDirectory)/.coverage_$(System.StageName)_${{imageOS}}_${{pyVersion}} 50 | artifact: cov_report_$(System.StageName)_${{imageOS}}_${{pyVersion}} -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | **/example.txt 2 | **/_build 3 | **/data -------------------------------------------------------------------------------- /docs/genalog_docs/_config.yml: -------------------------------------------------------------------------------- 1 | title : '' 2 | author: Jianjie Liu and Amit Gupte 3 | logo: static/genalog_full_logo.svg 4 | 5 | # Short description about the book 6 | description: >- 7 | Guide for 
end-to-end synthetic analog document generation 8 | 9 | execute: 10 | execute_notebooks : off 11 | 12 | # Interact link settings 13 | notebook_interface : "notebook" 14 | 15 | # Launch button settings 16 | repository: 17 | url : https://github.com/microsoft/genalog 18 | path_to_book : /docs/genalog_docs 19 | branch : main 20 | 21 | launch_buttons: 22 | notebook_interface : classic 23 | 24 | # HTML-specific settings 25 | html: 26 | favicon : static/genalog_favicon.svg 27 | home_page_in_navbar : false 28 | use_repository_button : true 29 | use_issues_button : true 30 | baseurl : https://microsoft.github.io/genalog/ 31 | extra_footer : "Don't forget to check out our paper from Document Intelligence Workshop at KDD 2021!" 33 | 34 | sphinx: 35 | extra_extensions: 36 | - sphinx_inline_tabs 37 | - sphinx.ext.autodoc 38 | - sphinx.ext.napoleon 39 | - sphinx.ext.viewcode 40 | config: 41 | napoleon_google_docstring: True 42 | autodoc_member_order: groupwise 43 | autoclass_content: both 44 | -------------------------------------------------------------------------------- /docs/genalog_docs/_toc.yml: -------------------------------------------------------------------------------- 1 | root: index 2 | format: jb-book 3 | defaults: 4 | numbered: false 5 | parts: 6 | - caption: Getting Started 7 | chapters: 8 | - file: installation 9 | - file: generation_pipeline 10 | - file: e2e_dataset_pipeline 11 | - caption: Fabricating Document & Noise 12 | chapters: 13 | - file: doc_generation 14 | - file: doc_degradation 15 | - caption: Handling Noisy Text 16 | chapters: 17 | - file: text_alignment 18 | - file: ocr_label_propagation 19 | - caption: API Documentation 20 | chapters: 21 | - file: docstring/genalog.degradation 22 | - file: docstring/genalog.generation 23 | - file: docstring/genalog.ocr 24 | - file: docstring/genalog.text 25 | -------------------------------------------------------------------------------- /docs/genalog_docs/docstring/genalog.degradation.rst: 
-------------------------------------------------------------------------------- 1 | genalog.degradation 2 | ==================== 3 | 4 | Image Degrader 5 | ----------------------------------- 6 | 7 | .. automodule:: genalog.degradation.degrader 8 | :members: 9 | 10 | Degradation Effects 11 | --------------------------------- 12 | 13 | .. automodule:: genalog.degradation.effect 14 | :members: 15 | :show-inheritance: -------------------------------------------------------------------------------- /docs/genalog_docs/docstring/genalog.generation.rst: -------------------------------------------------------------------------------- 1 | genalog.generation 2 | ========================== 3 | 4 | genalog.generation.content module 5 | --------------------------------- 6 | 7 | .. automodule:: genalog.generation.content 8 | :members: 9 | :show-inheritance: 10 | 11 | genalog.generation.document module 12 | ---------------------------------- 13 | 14 | .. automodule:: genalog.generation.document 15 | :members: 16 | :show-inheritance: 17 | -------------------------------------------------------------------------------- /docs/genalog_docs/docstring/genalog.ocr.rst: -------------------------------------------------------------------------------- 1 | genalog.ocr 2 | =================== 3 | 4 | This module will be *deprecated* in favor of the official `Azure Computer Vision SDK `_ . 5 | 6 | genalog.ocr.common module 7 | ------------------------- 8 | 9 | .. automodule:: genalog.ocr.common 10 | :members: 11 | 12 | genalog.ocr.grok module 13 | ----------------------- 14 | 15 | .. automodule:: genalog.ocr.grok 16 | :members: 17 | 18 | genalog.ocr.metrics module 19 | -------------------------- 20 | 21 | .. automodule:: genalog.ocr.metrics 22 | :members: 23 | 24 | genalog.ocr.rest\_client module 25 | ------------------------------- 26 | 27 | .. automodule:: genalog.ocr.rest_client 28 | :members: 29 | 30 | genalog.ocr.blob\_client module 31 | ------------------------------- 32 | 33 | .. 
automodule:: genalog.ocr.blob_client 34 | :members: 35 | 36 | -------------------------------------------------------------------------------- /docs/genalog_docs/docstring/genalog.text.rst: -------------------------------------------------------------------------------- 1 | genalog.text 2 | ==================== 3 | 4 | genalog.text.alignment module 5 | ----------------------------- 6 | 7 | .. automodule:: genalog.text.alignment 8 | :members: 9 | 10 | genalog.text.anchor module 11 | -------------------------- 12 | 13 | .. automodule:: genalog.text.anchor 14 | :members: 15 | 16 | genalog.text.conll\_format module 17 | --------------------------------- 18 | 19 | .. automodule:: genalog.text.conll_format 20 | :members: 21 | 22 | genalog.text.lcs module 23 | ----------------------- 24 | 25 | .. automodule:: genalog.text.lcs 26 | :members: 27 | 28 | genalog.text.ner\_label module 29 | ------------------------------ 30 | 31 | .. automodule:: genalog.text.ner_label 32 | :members: 33 | :private-members: _propagate_label_to_ocr 34 | 35 | genalog.text.preprocess module 36 | ------------------------------ 37 | 38 | .. automodule:: genalog.text.preprocess 39 | :members: 40 | 41 | genalog.text.splitter module 42 | ---------------------------- 43 | 44 | .. automodule:: genalog.text.splitter 45 | :members: 46 | 47 | 48 | -------------------------------------------------------------------------------- /docs/genalog_docs/e2e_dataset_pipeline.md: -------------------------------------------------------------------------------- 1 | # OCR-NER Dataset Generation 2 | 3 | ```{image} static/labeled_synthetic_pipeline.png 4 | :width: 80% 5 | :align: center 6 | ``` 7 | 8 | If you were brought here by [our paper](https://arxiv.org/abs/2108.02899), you may be interested in the data preparation pipeline built with `genalog`. 
The figure above shows the steps involved in transforming a Named-Entity Recognition (NER) dataset like [CoNLL 2003](https://deepai.org/dataset/conll-2003-english) with synthetic Optical Character Recognition (OCR) errors. This OCR-NER dataset is useful to train an error-prone NER model against common OCR mistakes. You can find the full dataset preparation pipeline in this [notebook](https://github.com/microsoft/genalog/blob/main/example/dataset_generation.ipynb) from our repo. 9 | 10 | We believe this methodology of inducing OCR errors onto the dataset can be applied to other NLP tasks to improve model performance against inherent noise from OCR outputs. We welcome the community to contribute if this fits your use cases. 11 | 12 | 13 | -------------------------------------------------------------------------------- /docs/genalog_docs/index.md: -------------------------------------------------------------------------------- 1 | # Synthetic Document Generator 2 | 3 | ![Python Versions](https://img.shields.io/badge/py-3.6%20%7C%203.7%20%7C%203.8%20-blue) [![arxiv link](https://img.shields.io/badge/arxiv-2108.02899-critical)](https://arxiv.org/abs/2108.02899) ![MIT license](https://img.shields.io/badge/License-MIT-blue.svg) 4 | 5 | ````{margin} 6 | ```sh 7 | pip install genalog 8 | ``` 9 | Star Us 10 | ```` 11 | 12 | `genalog` is an open source, cross-platform python package for **gen**erating document images with synthetic noise that mimics scanned an**alog** documents (thus the name `genalog`). You can also add various text degradations to these images. The purpose of this tool is to provide a fast and efficient way to generate synthetic documents from text data by leveraging layout from templates that you can create in simple HTML format. 13 | 14 | ```{figure} static/genalog_demo.gif 15 | :width: 80% 16 | Generate documents and apply degradations 17 | ``` 18 | 19 | `genalog` provides several document templates as a start. 
You can alter the document layout using standard CSS properties like `font-family`, `font-size`, `text-align`, etc. Here are some of the example generated documents: 20 | 21 | ````{tab} Multi-Column 22 | ```{figure} static/columns_Times_11px.png 23 | :width: 60% 24 | :name: two-columns-index 25 | Document template with 2 columns 26 | ``` 27 | ```` 28 | ````{tab} Letter-like 29 | ```{figure} static/letter_Times_11px.png 30 | :width: 60% 31 | :name: letter-like-index 32 | Letter-like document template 33 | ``` 34 | ```` 35 | ````{tab} Simple Text Block 36 | ```{figure} static/text_block_Times_11px.png 37 | :width: 60% 38 | :name: text-block-index 39 | Simple text block template 40 | ``` 41 | ```` 42 | 43 | Once a document is generated, you can combine various image degradation effects and apply onto the synthetic documents. Here are some of the degradation effects: 44 | 45 | ````{tab} Bleed-through 46 | ```{figure} static/bleed_through.png 47 | :name: bleed-through-index 48 | :width: 80% 49 | Mimics a document printed on two sides 50 | ``` 51 | ```` 52 | ````{tab} Blur 53 | ```{figure} static/blur.png 54 | :name: blur-index 55 | :width: 80% 56 | Lowers image quality 57 | ``` 58 | ```` 59 | ````{tab} Salt/Pepper 60 | ```{figure} static/salt_pepper.png 61 | :name: salt/pepper-index 62 | :width: 50% 63 | Mimics ink degradation 64 | ``` 65 | ```` 66 | `````{tab} Close/Dilate 67 | ```{figure} static/close_dilate.png 68 | :name: close-dilate-index 69 | :width: 90% 70 | Degrades printing quality 71 | ``` 72 | ````{margin} 73 | ```{note} 74 | For more details on this degradation, see [Morphilogical Operations](https://homepages.inf.ed.ac.uk/rbf/HIPR2/morops.htm) 75 | ``` 76 | ```` 77 | ````` 78 | `````{tab} Open/Erode 79 | ```{figure} static/open_erode.png 80 | :name: open-erode-index 81 | :width: 90% 82 | Ink overflows 83 | ``` 84 | ````{margin} 85 | ```{note} 86 | For more details on this degradation, see [Morphilogical 
Operations](https://homepages.inf.ed.ac.uk/rbf/HIPR2/morops.htm) 87 | ``` 88 | ```` 89 | ````` 90 | ````{tab} Combined Effects 91 | ```{figure} static/degrader.png 92 | :width: 40% 93 | :name: combined-effects-index 94 | Combining various degradation effects: blur, salt, open, and bleed-through 95 | ``` 96 | ```` 97 | 98 | In addition to the document generation and degradation, `genalog` also provide efficient implementation for [text alignment](text-alignment-page) between the source and noise text. 99 | 100 | -------------------------------------------------------------------------------- /docs/genalog_docs/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | Genalog is supported across Windows, Mac and Linux on Python 3.6+. However there are *additional* installation steps for Windows and Mac users. 4 | 5 | 6 | ````{tab} pip 7 | ```sh 8 | pip install genalog 9 | ``` 10 | ```` 11 | ````{tab} source 12 | ```sh 13 | git clone https://github.com/microsoft/genalog.git && cd genalog && pip install -e . 14 | ``` 15 | ```` 16 | 17 | ## Extra Steps for Windows & Mac Users 18 | 19 | We have a dependency on [`Weasyprint`](https://weasyprint.readthedocs.io/en/stable/install.html) for image generation, which in turn has non-python dependencies including `Pango`, `cairo` and `GDK-PixBuf` that need to be installed separately. 20 | 21 | So far, `Pango`, `cairo` and `GDK-PixBuf` libraries are available in `Ubuntu-18.04` and later by default. 22 | 23 | If you are running on Windows, MacOS, or other Linux distributions, please see [installation instructions from WeasyPrint](https://weasyprint.readthedocs.io/en/stable/install.html). 24 | 25 | ```{note} 26 | If you encounter the errors like `no library called "libcairo-2" was found`, this is probably due to the three extra dependencies missing. 
27 | ``` 28 | 29 | -------------------------------------------------------------------------------- /docs/genalog_docs/static/analog_doc_gen_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/analog_doc_gen_pipeline.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/bleed_through.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/bleed_through.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/blur.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/blur.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/close_dilate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/close_dilate.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/columns_Times_11px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/columns_Times_11px.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/degrader.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/degrader.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/degrader_heavy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/degrader_heavy.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/genalog_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/genalog_demo.gif -------------------------------------------------------------------------------- /docs/genalog_docs/static/kernel_morph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/kernel_morph.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/labeled_synthetic_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/labeled_synthetic_pipeline.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/letter_Times_11px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/letter_Times_11px.png -------------------------------------------------------------------------------- 
/docs/genalog_docs/static/open_erode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/open_erode.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/salt_pepper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/salt_pepper.png -------------------------------------------------------------------------------- /docs/genalog_docs/static/text_block_Times_11px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/docs/genalog_docs/static/text_block_Times_11px.png -------------------------------------------------------------------------------- /docs/requirements-doc.txt: -------------------------------------------------------------------------------- 1 | jupyter-book 2 | sphinx 3 | sphinx_inline_tabs 4 | ghp-import -------------------------------------------------------------------------------- /example/demo_generate.py: -------------------------------------------------------------------------------- 1 | #%% 2 | from genalog.pipeline import AnalogDocumentGeneration 3 | from genalog.degradation.degrader import ImageState 4 | 5 | sample_text = "sample/generation/example.txt" 6 | 7 | # Common CSS properties 8 | STYLE_COMBINATIONS = { 9 | "font_family" : ["Times"], # sans-serif, Times, monospace, etc 10 | "font_size" : ["12px"], 11 | "text_align" : ["justify"], # left, right, center, justify 12 | "language" : ["en_US"], # controls how words are hyphenated 13 | "hyphenate" : [True], 14 | } 15 | 16 | # .html.jinja 17 | HTML_TEMPLATE = "columns.html.jinja" 18 | 19 | # Degration effects applied in 
sequence 20 | DEGRADATIONS = [ 21 | ("blur", {"radius": 3}), # needs to be an odd number 22 | ("bleed_through", { 23 | "src": ImageState.CURRENT_STATE, "background": ImageState.ORIGINAL_STATE, 24 | "alpha": 0.8, 25 | "offset_y": 9, "offset_x": 12 26 | }), 27 | ("morphology", {"operation": "open", "kernel_shape": (3, 3)}), 28 | ("pepper", {"amount": 0.05}), 29 | ("salt", {"amount": 0.05}), 30 | ] 31 | 32 | doc_generation = AnalogDocumentGeneration(styles=STYLE_COMBINATIONS, degradations=DEGRADATIONS) 33 | img_array = doc_generation.generate_img(sample_text, HTML_TEMPLATE, target_folder=None) 34 | 35 | import cv2 36 | from IPython.core.display import Image, display 37 | 38 | _, encoded_image = cv2.imencode('.png', img_array) 39 | display(Image(data=encoded_image, width=600)) 40 | 41 | -------------------------------------------------------------------------------- /example/ocr_label_propagation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## `genalog.text` module: \n", 8 | "This module is responsible for:\n", 9 | "1. Text alignment\n", 10 | "1. 
NER label propagation using text alignment results" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from genalog.text import ner_label\n", 20 | "from genalog.text import preprocess\n", 21 | "\n", 22 | "gt_txt = \"New York is big\"\n", 23 | "ocr_txt = \"New Yo rkis big\"\n", 24 | "\n", 25 | "# Input to the method\n", 26 | "gt_labels = [\"B-P\", \"I-P\", \"O\", \"O\"]\n", 27 | "gt_tokens = preprocess.tokenize(gt_txt) # tokenize into list of tokens\n", 28 | "ocr_tokens = preprocess.tokenize(ocr_txt)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "['B-P', 'I-P', 'O', 'O']\n", 41 | "['New', 'York', 'is', 'big']\n", 42 | "['New', 'Yo', 'rkis', 'big']\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "# Inputs to the method\n", 48 | "print(gt_labels)\n", 49 | "print(gt_tokens)\n", 50 | "print(ocr_tokens)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 5, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Method returns a tuple of 4 elements (gt_tokens, gt_labels, ocr_tokens, ocr_labels, gap_char)\n", 60 | "ocr_labels, aligned_gt, aligned_ocr, gap_char = ner_label.propagate_label_to_ocr(gt_labels, gt_tokens, ocr_tokens)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 6, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "OCR labels: ['B-P', 'I-P', 'I-P', 'O']\n", 73 | "Aligned ground truth: New Yo@rk is big\n", 74 | "Alinged OCR text: New Yo rk@is big\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "# Outputs\n", 80 | "print(f\"OCR labels: {ocr_labels}\")\n", 81 | "print(f\"Aligned ground truth: {aligned_gt}\")\n", 82 | "print(f\"Alinged OCR text: {aligned_ocr}\")" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 
87 | "execution_count": 9, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "name": "stdout", 92 | "output_type": "stream", 93 | "text": [ 94 | "B-P I-P O O \n", 95 | "New York is big \n", 96 | "New Yo@rk is big\n", 97 | "||||||.||.||||||\n", 98 | "New Yo rk@is big\n", 99 | "New Yo rkis big \n", 100 | "B-P I-P I-P O \n", 101 | "\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "# Format result for display\n", 107 | "print(ner_label.format_label_propagation(gt_tokens, gt_labels, ocr_tokens, ocr_labels, aligned_gt, aligned_ocr))" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 12, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "B-P I-P O O \n", 120 | "New York is big \n", 121 | "New Yo rkis big \n", 122 | "B-P I-P I-P O \n", 123 | "\n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "# To turn off alignment information:\n", 129 | "print(ner_label.format_label_propagation(gt_tokens, gt_labels, ocr_tokens, ocr_labels, aligned_gt, aligned_ocr, show_alignment=False))" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 14, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "B-P I-P I-P O \n", 142 | "New Yo rkis big \n", 143 | "\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "# Format tokens and labels\n", 149 | "print(ner_label.format_labels(ocr_tokens, ocr_labels))" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [] 158 | } 159 | ], 160 | "metadata": { 161 | "kernelspec": { 162 | "display_name": "Python 3", 163 | "language": "python", 164 | "name": "python3" 165 | }, 166 | "language_info": { 167 | "codemirror_mode": { 168 | "name": "ipython", 169 | "version": 3 170 | }, 171 | "file_extension": ".py", 172 | "mimetype": "text/x-python", 173 | "name": "python", 174 
| "nbconvert_exporter": "python", 175 | "pygments_lexer": "ipython3", 176 | "version": "3.6.9" 177 | } 178 | }, 179 | "nbformat": 4, 180 | "nbformat_minor": 4 181 | } 182 | -------------------------------------------------------------------------------- /example/sample/degradation/bleed_through.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/bleed_through.png -------------------------------------------------------------------------------- /example/sample/degradation/blur.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/blur.png -------------------------------------------------------------------------------- /example/sample/degradation/close_dilate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/close_dilate.png -------------------------------------------------------------------------------- /example/sample/degradation/degrader.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/degrader.png -------------------------------------------------------------------------------- /example/sample/degradation/degrader_heavy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/degrader_heavy.png -------------------------------------------------------------------------------- 
/example/sample/degradation/kernel_morph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/kernel_morph.png -------------------------------------------------------------------------------- /example/sample/degradation/open_erode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/open_erode.png -------------------------------------------------------------------------------- /example/sample/degradation/salt_pepper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/salt_pepper.png -------------------------------------------------------------------------------- /example/sample/degradation/text_block.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/text_block.png -------------------------------------------------------------------------------- /example/sample/degradation/text_zoomed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/text_zoomed.png -------------------------------------------------------------------------------- /example/sample/generation/columns_Times_11px.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/columns_Times_11px.pdf 
-------------------------------------------------------------------------------- /example/sample/generation/columns_Times_11px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/columns_Times_11px.png -------------------------------------------------------------------------------- /example/sample/generation/example.txt: -------------------------------------------------------------------------------- 1 | Time magazine , in a move to reduce the costs of wooing new subscribers , is lowering its circulation guarantee to advertisers for the second consecutive year , increasing its subscription rates and cutting back on merchandise giveaways . 2 | In an announcement to its staff last week , executives at Time Warner Inc. 's weekly magazine said Time will `` dramatically de-emphasize '' its use of electronic giveaways such as telephones in television subscription drives ; cut the circulation it guarantees advertisers by 300,000 , to four million ; and increase the cost of its annual subscription rate by about $ 4 to $ 55 . 3 | In a related development , the news - weekly , for the fourth year in a row , said it wo n't increase its advertising rates in 1990 ; a full , four - color page in the magazine costs about $ 120,000 . 4 | However , because the guaranteed circulation base is being lowered , ad rates will be effectively 7.5 % higher per subscriber , according to Richard Heinemann , Time associate publisher . 5 | Time is following the course of some other mass - circulation magazines that in recent years have challenged the publishing myth that maintaining artificially high , and expensive , circulations is the way to draw advertisers . 6 | In recent years , Reader 's Digest , New York Times Co. 's McCall 's , and most recently News Corp. 
's TV Guide , have cut their massive circulation rate bases to eliminate marginal circulation and hold down rates for advertisers . 7 | Deep discounts in subscriptions and offers of free clock radios and watches have become accepted forms of attracting new subscribers in the hyper-competitive world of magazine news - weeklies . 8 | But Time , as part of the more cost - conscious Time Warner , wants to wean itself away from expensive gimmicks . 9 | Besides , Time executives think selling a news magazine with a clock radio is tacky . 10 | 11 | 12 | `` Giveaways just give people the wrong image , '' said Mr. Heinemann . 13 | `` That perception takes the focus off the magazine . '' 14 | Time magazine executives predictably paint the circulation cut as a show of strength and actually a benefit to advertisers . 15 | `` What we are doing is screening out the readers who are only casually related to the magazine and do n't really read it , '' said Mr. Heinemann . 16 | `` We are trying to create quality and involvement . '' 17 | However , Time executives used the same explanation when in October 1988 the magazine cut its guaranteed circulation from 4.6 million to 4.3 million . 18 | And Time 's paid circulation , according to Audit Bureau of Circulations , dropped 7.3 % to 4,393,237 in the six months ended June 30 , 1989 . 19 | Still , Time 's move is being received well , once again . 20 | `` It 's terrific for advertisers to know the reader will be paying more , '' said Michael Drexler , national media director at Bozell Inc. ad agency . 21 | `` A few drops in circulation are of no consequence . 22 | It 's not a show of weakness ; they are improving the quality of circulation while insuring their profits . '' 23 | Mr. Heinemann said the changes represent a new focus in the magazine industry : a magazine 's net revenue per subscriber , or the actual revenue from subscribers after discounts and the cost of premiums have been stripped away . 
24 | `` The question is how much are we getting from each reader , '' said Mr. Heinemann . 25 | Time 's rivals news - weeklies , Washington Post Co. 's Newsweek and U.S. News & World Report , are less reliant on electronic giveaways , and in recent years both have been increasing their circulation rate bases . 26 | Both magazines are expected to announce their ad rates and circulation levels for 1990 within a month . -------------------------------------------------------------------------------- /example/sample/generation/letter_Times_11px.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/letter_Times_11px.pdf -------------------------------------------------------------------------------- /example/sample/generation/letter_Times_11px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/letter_Times_11px.png -------------------------------------------------------------------------------- /example/sample/generation/text_block_Times_11px.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/text_block_Times_11px.pdf -------------------------------------------------------------------------------- /example/sample/generation/text_block_Times_11px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/text_block_Times_11px.png -------------------------------------------------------------------------------- /example/sample/generation/text_block_Times_11px_pg_0.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/text_block_Times_11px_pg_0.png -------------------------------------------------------------------------------- /example/sample/generation/text_block_Times_11px_pg_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/text_block_Times_11px_pg_1.png -------------------------------------------------------------------------------- /example/static/analog_doc_gen_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/static/analog_doc_gen_pipeline.png -------------------------------------------------------------------------------- /example/static/genalog_components.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/static/genalog_components.png -------------------------------------------------------------------------------- /example/static/labeled_synthetic_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/static/labeled_synthetic_pipeline.png -------------------------------------------------------------------------------- /genalog/README.md: -------------------------------------------------------------------------------- 1 | # Genalog Core 2 | 3 | This is the core of the package and contains all core components necessary to generate new docs, degrade the documents and get text out of degraded images using 
OCR Capabilities of Azure. 4 | 5 | ## Image Generation 6 | 7 | This directory contains the class implementations for image generation. The image generation leverages [Jinja templates](https://jinja.palletsprojects.com/en/2.11.x/templates/) for image generation. You can create a Jinja HTML template for any image layout and specify content variables to add content into images. This allows you the flexibility to be as declarative as possible. 8 | 9 | [Here is our guide to Image Generation](generation/README.md) 10 | 11 | ## Image Degradation 12 | 13 | This directory contains the class implementations for degrading your images such that they simulate real world Document degradations. 14 | 15 | [Here is our guide to Image Degradation](degradation/README.md) 16 | 17 | ## Extract Text from Images 18 | 19 | This directory contains the class implementations for Extract Text from Images using Azure OCR Process. 20 | 21 | [Here is our guide to Extract Text from Images](ocr/README.md) 22 | 23 | ## Text Alignment 24 | 25 | This directory contains the class implementations for text alignment. We expect that these capabilities will be required when you need to align text with its incorrect versions when you degrade documents and then have errors in OCR. We use [Biopython's](https://biopython.org/) implementation of the Needleman-Wunsch algorithm for text alignment as the method `genalog.text.alignment.align()`. This algorithm is an exhaustive search for all possible candidates with dynamic programming. It produces weighted score for each candidate and returns those having the highest score. Note this is an algorithm with quadratic time and space complexity, and is not so efficient on aligning longer strings. 
26 | 27 | For more efficient alignment on longer documents, we also include an implementation of the RETAS method from the paper ["A Fast Alignment Scheme for Automatic OCR Evaluation of Books"](https://ieeexplore.ieee.org/document/6065412) in `genalog.text.anchor.align_w_anchor()`. We would recommend using this method for input longer than 200 characters. 28 | 29 | [Here is our guide to Text Alignment](text/README.md) 30 | -------------------------------------------------------------------------------- /genalog/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/genalog/__init__.py -------------------------------------------------------------------------------- /genalog/degradation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/genalog/degradation/__init__.py -------------------------------------------------------------------------------- /genalog/generation/.gitignore: -------------------------------------------------------------------------------- 1 | # output folders for debugging purpose 2 | output/ 3 | # sample input for debugging 4 | sample/ -------------------------------------------------------------------------------- /genalog/generation/README.md: -------------------------------------------------------------------------------- 1 | ## Document Generation 2 | 3 | This folder contains the scripts that allow you generate synthetic documents from any given text. We provide **three** standard templates for with document layouts: 4 | 5 |

6 | 7 | 8 | 9 |

10 | 11 | You can find these templates in path `genalog/generation/templates`. 12 | 13 | ### 1. Document Content 14 | 15 | The goal is to be able to generate synthetic documents on ANY text input. However, to properly initiate the content populating a document template, we need to create the `CompositeContent` class. 16 | 17 | ```python 18 | from genalog.generation.content import CompositeContent, ContentType 19 | 20 | # Here we are loading an sample text file in the root "example" directory 21 | # You may use any text as well. 22 | with open("example/sample/generation/example.txt", 'r') as f: 23 | text = f.read() 24 | 25 | # Initialize CompositeContent Object 26 | paragraphs = text.split('\n\n') # split paragraphs by `\n\n` 27 | content_types = [ContentType.PARAGRAPH] * len(paragraphs) 28 | content = CompositeContent(paragraphs, content_types) 29 | ``` 30 | The `CompositeContent` is a list of pairs of bodies of text and their `ContentType`. Here we can declaring a list of multiple `ContentType.PARAGRAPH`s. 31 | 32 | ### 2. Populate Content into Template 33 | 34 | Once we initialized a `CompositeContent` object, we can populate the content into any standard template, via `DocumentGenerator` class. 35 | 36 | ```python 37 | from genalog.generation.document import DocumentGenerator 38 | default_generator = DocumentGenerator() 39 | 40 | print(f"Available default templates: {default_generator.template_list}") 41 | print(f"Default styles to generate: {default_generator.styles_to_generate}") 42 | ``` 43 | The `DocumentGenerator` has default styles. The above code snippet will show the default configurations and the names of the 3 standard templates. You will use the information to select the template you want to generate. 
The three templates are `["columns.html.jinja", "letter.html.jinja", "text_block.html.jinja"]` 44 | 45 | ```python 46 | # Select specific template, content and create the generator 47 | doc_gen = default_generator.create_generator(content, ["columns.html.jinja", "letter.html.jinja", "text_block.html.jinja"]) 48 | # we will use the `CompositeContent` object initialized from above cell 49 | 50 | # python generator 51 | for doc in doc_gen: 52 | template_name = doc.template.name.replace(".html.jinja", "") 53 | doc.render_png(target=f"example_{template_name}.png", resolution=300) #in dots per inch 54 | ``` 55 | You can also retrieve the raw image byte information without specifying the `target` 56 | 57 | ```python 58 | from genalog.generation.document import DocumentGenerator 59 | from IPython.core.display import Image, display 60 | 61 | doc_gen = default_generator.create_generator(content, ['text_block.html.jinja']) 62 | 63 | for doc in doc_gen: 64 | image_byte = doc.render_png(resolution=100) 65 | display(Image(image_byte)) 66 | ``` 67 | 68 | Alternatively, you can also save the document as a PDF file. 69 | 70 | ```python 71 | # Select specific template, content and create the generator 72 | doc_gen = default_generator.create_generator(content, ['text_block.html.jinja']) 73 | # we will use the `CompositeContent` object initialized from above cell 74 | 75 | # python generator 76 | for doc in doc_gen: 77 | doc.render_pdf(target="example_text_block.pdf") 78 | ``` 79 | 80 | ### Changing Document Styles 81 | 82 | You can alter the document styles including font family, font size, enabling hyphenation, and text alignment. These are mock style properties of their CSS counterparts. You can use standard CSS values to replace the following properties. 83 | 84 | ```python 85 | from genalog.generation.document import DocumentGenerator 86 | from IPython.core.display import Image, display 87 | 88 | # You can add as many options as possible.
from enum import auto, Enum


class ContentType(Enum):
    """Kinds of content a document template can hold."""

    PARAGRAPH = auto()
    TITLE = auto()
    IMAGE = auto()
    COMPOSITE = auto()


class Content:
    """Base class for a unit of document content.

    Subclasses record a ``ContentType`` via ``set_content_type`` and store
    their payload in ``self._content``; sequence-style access (``str``,
    iteration, indexing) is delegated to that payload.
    """

    def __init__(self):
        # Sequence-protocol flag; subclasses get it via super().__init__().
        self.iterable = True
        self._content = None

    def set_content_type(self, content_type):
        """Record the content type of this object.

        Args:
            content_type (ContentType): a member of the ``ContentType`` enum.

        Raises:
            TypeError: if ``content_type`` is not a ``ContentType`` member.
        """
        # isinstance() instead of ``type(x) != ContentType`` equality:
        # the idiomatic check, and it also accepts potential subclasses.
        if not isinstance(content_type, ContentType):
            raise TypeError(
                f"Invalid content type: {content_type}, valid types are {list(ContentType)}"
            )
        self.content_type = content_type

    def validate_content(self, content):
        """Validate the raw payload. Concrete subclasses must override.

        Raises:
            NotImplementedError: always, on the base class.
        """
        # Bug fix: the original body was the bare expression
        # ``NotImplementedError`` -- a silent no-op. It must be raised.
        raise NotImplementedError

    def __str__(self):
        return self._content.__str__()

    def __iter__(self):
        return self._content.__iter__()

    def __getitem__(self, key):
        return self._content.__getitem__(key)


class Paragraph(Content):
    """A single paragraph of plain text."""

    def __init__(self, content):
        # Bug fix: call super().__init__() so base attributes
        # (``iterable``, ``_content``) are initialized.
        super().__init__()
        self.set_content_type(ContentType.PARAGRAPH)
        self.validate_content(content)
        self._content = content

    def validate_content(self, content):
        """Raise TypeError unless ``content`` is a str."""
        if not isinstance(content, str):
            raise TypeError(f"Expect a str, but got {type(content)}")


class Title(Content):
    """A document title line."""

    def __init__(self, content):
        # Bug fix: call super().__init__() (see Paragraph).
        super().__init__()
        self.set_content_type(ContentType.TITLE)
        self.validate_content(content)
        self._content = content

    def validate_content(self, content):
        """Raise TypeError unless ``content`` is a str."""
        if not isinstance(content, str):
            raise TypeError(f"Expect a str, but got {type(content)}")


class CompositeContent(Content):
    """An ordered collection of ``Title``/``Paragraph`` sections."""

    def __init__(self, content_list, content_type_list):
        """Build the composite from parallel lists of payloads and types.

        Args:
            content_list (list): raw string payloads, one per section.
            content_type_list (list): matching ``ContentType`` members.

        Raises:
            TypeError: if ``content_list`` is not a list.
            NotImplementedError: for content types other than
                ``TITLE`` and ``PARAGRAPH``.
        """
        # super().__init__() sets ``iterable`` -- the original re-assigned
        # it manually instead of initializing the base class.
        super().__init__()
        self.set_content_type(ContentType.COMPOSITE)
        self.validate_content(content_list)
        self.construct_content(content_list, content_type_list)

    def validate_content(self, content_list):
        """Raise TypeError unless ``content_list`` is a list."""
        if not isinstance(content_list, list):
            raise TypeError(f"Expect a list of content, but got {type(content_list)}")

    def construct_content(self, content_list, content_type_list):
        # Wrap each raw payload in its concrete Content subclass.
        # NOTE(review): zip() silently drops trailing items if the two lists
        # differ in length -- presumably callers pass equal-length lists;
        # confirm before tightening.
        self._content = []
        for content, content_type in zip(content_list, content_type_list):
            if content_type == ContentType.TITLE:
                self._content.append(Title(content))
            elif content_type == ContentType.PARAGRAPH:
                self._content.append(Paragraph(content))
            else:
                raise NotImplementedError(f"{content_type} is not currently supported")

    def insert_content(self, new_content, index):
        # Bug fix: previously a no-op bare ``NotImplementedError`` expression,
        # so callers silently got ``None`` instead of an error.
        raise NotImplementedError

    def delete_content(self, index):
        # Bug fix: previously a no-op bare ``NotImplementedError`` expression.
        raise NotImplementedError

    def __repr__(self):
        return "CompositeContent(" + self._content.__repr__() + ")"

    def __str__(self):
        """Return a flat string view of the nested content objects.

        Format is kept identical to the original, including the trailing
        comma-space before the closing bracket: '["first", "second", ]'.
        """
        transparent_str = "["
        for content in self._content:
            transparent_str += '"' + content.__str__() + '", '
        return transparent_str + "]"
#} 2 | 3 | .title, .authors { 4 | margin: auto; 5 | width: 80%; 6 | text-align: center; 7 | } 8 | 9 | .title { 10 | font-weight: bold; 11 | } 12 | 13 | .authors { 14 | font-style: italic; 15 | margin: 15px auto ; 16 | } 17 | 18 | .abstract { 19 | margin: auto; 20 | width: 100%; 21 | text-align: justify; 22 | margin-bottom: 5px; 23 | } 24 | 25 | .abstract-title { 26 | font-weight: bold; 27 | font-size: 14px; 28 | text-align: center; 29 | margin-bottom: 5px; 30 | } 31 | 32 | .columns { 33 | margin-top: 0; 34 | } 35 | .columns { 36 | column-gap: 40px; 37 | {% if column_num %} 38 | column-count: {{ column_num }}; 39 | {% else %} 40 | column-count: 2; 41 | {% endif %} 42 | } 43 | .title { 44 | font-size: 16px; 45 | } 46 | .section-title { 47 | font-weight: bold; 48 | font-size: {{ font_size_title }}; 49 | } 50 | .section-content { 51 | 52 | } 53 | img { 54 | max-width:100%; 55 | height:auto; 56 | } -------------------------------------------------------------------------------- /genalog/generation/templates/columns.html.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} 2 | 3 | {% extends "base.html.jinja" %} 4 | {%- block style %} 5 | {# Global Style #} 6 | {% import "macro/dimension.css.jinja" as dimension %} 7 | {{ dimension.a4_paper() }} 8 | {% import "macro/text.css.jinja" as text %} 9 | {{ text.set_font(font_family, font_size) }} 10 | {{ text.set_hyphenation(hyphenate) }} 11 | {{ text.set_text_align(text_align) }} 12 | {% import "macro/page_layout.css.jinja" as layout %} 13 | {{ layout.set_page_num() }} 14 | {# Element-Specific Style #} 15 | {%- include "columns.css.jinja" with context %} 16 | {% endblock style %} 17 | 18 | {% block body %} 19 |
20 |

A Study of Wild Unicorns in a Rainbow-rich Habitat

21 |
22 | 23 |
24 | Pony Tail, Sweet Rock, Umbrella Mushroom
25 | Colourful University of Magic
26 | One Rainbow Road
27 | Utopia, 001
28 | everyone@happiness.joy 29 |
30 | 31 |
32 |
Abstract
33 | A study of wild unicorns in a rainbow-rich habitat, 34 | in an effort to understand the dynamics of this unusual animal. 35 | "Rainbows are considered a sign of life," explained Lise Saut ter, 36 | a scientist at the University of Ber gen in Norway and lead author 37 | of the study. "The unicorn also has a very interesting evolutionary 38 | history. This study is a first step toward understanding why unicorns 39 | behave the way they do." In order to better understand these unique 40 | animals, researchers collected four wild females from the rain forest 41 | in Northern Norway in 2006. They spent several weeks with them, feeding 42 | them on different types of wild fruit, grass and mushrooms, and recording 43 | the activity and responses of the wild animals. 44 |
45 | 46 | {% if content and content.content_type.__str__() == "ContentType.COMPOSITE"%} 47 |
48 | {% for c in content %} 49 | {% if c.content_type.__str__() == "ContentType.TITLE"%} 50 |

{{ c }}

51 | {% elif c.content_type.__str__() == "ContentType.PARAGRAPH" %} 52 |

{{ c }}

53 | {% else %} 54 |

Unsupported Content Type: {{c.content_type.__str__()}}

55 | {% endif %} 56 | {% endfor %} 57 |
58 | {% else %} 59 |
60 | No content loaded or content is not an instance of CompositeContent Class 61 |
62 | {% endif %} 63 | 64 | {% endblock body %} 65 | -------------------------------------------------------------------------------- /genalog/generation/templates/letter.css.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} 2 | 3 | .section-title { 4 | font-weight: bold; 5 | font-size: {{ font_size_title }}; 6 | } 7 | 8 | .letter-head { 9 | margin: auto; 10 | width: 50%; 11 | text-align: center; 12 | font-size: 16px; 13 | font-weight: bold; 14 | font-style: italic; 15 | } 16 | 17 | .letter-head p { 18 | margin-top: 0; 19 | } 20 | 21 | .addressee { 22 | margin: 30px 0 15px 0 ; 23 | } 24 | 25 | -------------------------------------------------------------------------------- /genalog/generation/templates/letter.html.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} 2 | 3 | {% extends "base.html.jinja" %} 4 | {%- block style %} 5 | {% import "macro/dimension.css.jinja" as dimension %} 6 | {{ dimension.a4_paper() }} 7 | {% import "macro/text.css.jinja" as text %} 8 | {{ text.set_font(font_family, font_size) }} 9 | {{ text.set_hyphenation(hyphenate) }} 10 | {{ text.set_text_align(text_align) }} 11 | {% import "macro/page_layout.css.jinja" as layout %} 12 | {{ layout.set_page_num() }} 13 | {%- include "letter.css.jinja" with context %} 14 | {% endblock style %} 15 | 16 | {% block body %} 17 |
18 | 19 |

Company X
20 | One Company Road
21 | City, State, 0001
22 | January 1st, 2020

23 |
24 | 25 |
26 | Dear Mr/Ms. X 27 |
28 | 29 | {% if content and content.content_type.__str__() == "ContentType.COMPOSITE"%} 30 |
31 | {% for c in content %} 32 | {% if c.content_type.__str__() == "ContentType.TITLE"%} 33 |

{{ c }}

34 | {% elif c.content_type.__str__() == "ContentType.PARAGRAPH" %} 35 |

{{ c }}

36 | {% else %} 37 |

Unsupported Content Type: {{c.content_type.__str__()}}

38 | {% endif %} 39 | {% endfor %} 40 |
41 | {% else %} 42 |
43 | No content loaded or content is not an instance of CompositeContent Class 44 |
45 | {% endif %} 46 | {% endblock body %} -------------------------------------------------------------------------------- /genalog/generation/templates/macro/dimension.css.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} 2 | 3 | {% macro set_page_dimension(width, height, margin) -%} 4 | @page { 5 | size: {{ width }}cm {{ height }}cm; 6 | margin: {{ margin }}cm; 7 | } 8 | {% endmacro %} 9 | 10 | {% macro a4_paper(margin=2) %} 11 | {{ set_page_dimension(21, 30, margin) }} 12 | {% endmacro %} -------------------------------------------------------------------------------- /genalog/generation/templates/macro/page_layout.css.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} 2 | 3 | {% macro set_page_num() -%} 4 | @page { 5 | @bottom-right { content: counter(page); } 6 | } 7 | {% endmacro %} 8 | 9 | {% macro set_page_bg() %} 10 | @page { 11 | background: white; 12 | } 13 | {% endmacro%} -------------------------------------------------------------------------------- /genalog/generation/templates/macro/text.css.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. 
#} 2 | 3 | {% macro set_font(font_family, size) -%} 4 | html { 5 | font-family: {{ font_family }}; 6 | font-size: {{ size }}; 7 | } 8 | {% endmacro %} 9 | 10 | {% macro set_hyphenation(hyphenate=True) -%} 11 | {% if hyphenate %} 12 | html { hyphens: auto; } 13 | {% else %} 14 | html { hyphens: none; } 15 | {% endif %} 16 | {% endmacro %} 17 | 18 | {% macro set_text_align(alignment) -%} 19 | html { text-align: {{ alignment }} } 20 | {% endmacro %} -------------------------------------------------------------------------------- /genalog/generation/templates/text_block.css.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} -------------------------------------------------------------------------------- /genalog/generation/templates/text_block.html.jinja: -------------------------------------------------------------------------------- 1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #} 2 | 3 | {% extends "base.html.jinja" %} 4 | {%- block style %} 5 | {# Global Style #} 6 | {% import "macro/dimension.css.jinja" as dimension %} 7 | {{ dimension.a4_paper() }} 8 | {% import "macro/text.css.jinja" as text %} 9 | {{ text.set_font(font_family, font_size) }} 10 | {{ text.set_hyphenation(hyphenate) }} 11 | {{ text.set_text_align(text_align) }} 12 | {# Element-Specific Style #} 13 | {%- include "text_block.css.jinja" with context %} 14 | {% endblock style %} 15 | 16 | {% block body %} 17 | {% if content and content.content_type.__str__() == "ContentType.COMPOSITE"%} 18 |
19 | {% for c in content %} 20 | {% if c.content_type.__str__() == "ContentType.PARAGRAPH" %} 21 |

{{ c }}

22 | {% else %} 23 |

Unsupported Content Type: {{c.content_type.__str__()}}

24 | {% endif %} 25 | {% endfor %} 26 |
27 | {% else %} 28 |
29 | No content loaded or content is not an instance of CompositeContent Class 30 |
31 | {% endif %} 32 | {% endblock body %} -------------------------------------------------------------------------------- /genalog/ocr/README.md: -------------------------------------------------------------------------------- 1 | # GROK Client 2 | 3 | Use the GROK client to make rest calls to the Azure Search Service to create and run the indexing pipeline. Blob client is used to transfer the images to blob and download the extracted OCR from blob. 4 | 5 | Example usage: 6 | 7 | 1. Create an .env file with the environment variables that includes the names of you index, indexer, skillset, and datasource to create on the search service. Include keys to the blob that contains the documents you want to index, keys to the cognitive service and keys to you computer vision subscription and search service. In order to index more than 20 documents, you must have a computer services subscription. You can find the keys for the services in the Azure Portal. An example of the .env file content is given below: 8 | 9 | ```bash 10 | 11 | SEARCH_SERVICE_NAME = "ocr-ner-pipeline" 12 | SKILLSET_NAME = "ocrskillset" 13 | INDEX_NAME = "ocrindex" 14 | INDEXER_NAME = "ocrindexer" 15 | DATASOURCE_NAME = "syntheticimages" 16 | DATASOURCE_CONTAINER_NAME = "ocrimages" 17 | PROJECTIONS_CONTAINER_NAME = "ocrprojection" 18 | 19 | BLOB_NAME = "syntheticimages" 20 | BLOB_KEY = "" 21 | SEARCH_SERVICE_KEY = "" 22 | COGNITIVE_SERVICE_KEY = "" 23 | ``` 24 | 25 | 2. Source this .env file to load the variables then you can create and use the Grok class , REST client or blob client. 26 | 27 | 3. First, we need to upload our image files to azure blob. To do this, we use the blob client and call the `upload_images_to_blob` function. This function takes in the local and remote path and an optional parameter to specify whether to use asyncio asynchronous uploads [https://docs.python.org/3/library/asyncio.html]. Asynchronous uploads are faster, however, some setups of python may not support them. 
In such cases, synchronous uploads can be made using `use_async=False`. 28 | 29 | ```python 30 | from genalog.ocr.blob_client import GrokBlobClient 31 | from dotenv import load_dotenv 32 | load_dotenv(".env") 33 | destination_folder_name, upload_task = blob_client.upload_images_to_blob(local_path, remote_path, use_async=True) 34 | await upload_task 35 | ``` 36 | 37 | 4. Once files are uploaded, use the rest client to create an indexing pipeline to extract the text from the images on blob. The results are stored as json blobs in a projection blob container where the names of these json blobs are the base64 encoded paths of the source blob images. The name of this projection container is specified in the env file. The `poll_indexer_till_complete` will block and continuously poll the indexer until it completely processes all docs. 38 | 39 | ```python 40 | from genalog.ocr.rest_client import GrokRestClient 41 | from dotenv import load_dotenv 42 | load_dotenv(".env") 43 | 44 | grok_rest_client = GrokRestClient.create_from_env_var() 45 | grok_rest_client.create_indexing_pipeline() 46 | grok_rest_client.run_indexer() 47 | indexer_status = grok_rest_client.poll_indexer_till_complete() 48 | 49 | ``` 50 | 51 | 5. Once the indexer completes, use the blob client to download the results from the projections blob. 52 | 53 | ```python 54 | from genalog.ocr.blob_client import GrokBlobClient 55 | from dotenv import load_dotenv 56 | load_dotenv(".env") 57 | 58 | output_folder = "./ocr" 59 | async_download_task = blob_client.get_ocr_json( remote_path, output_folder, use_async=True) 60 | await async_download_task 61 | ``` 62 | 63 | 6. Alternatively, steps 3, 4 and 5 can be skipped by using the Grok class. This class is a wrapper of the rest and blob clients. 
It upload images from src_folder_path to blob, runs the indexer, then donwloads the ocr projections to dest_folder_path 64 | 65 | 66 | ```python 67 | from genalog.ocr.grok import Grok 68 | from dotenv import load_dotenv 69 | load_dotenv("tests/unit/ocr/.env") 70 | 71 | grok = Grok.create_from_env_var() 72 | grok.run_grok(src_folder_path = "tests/unit/ocr/data/img", dest_folder_path = "tests/unit/ocr/data/json") 73 | ``` 74 | 75 | -------------------------------------------------------------------------------- /genalog/ocr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/genalog/ocr/__init__.py -------------------------------------------------------------------------------- /genalog/ocr/common.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | DEFAULT_PROJECTIONS_CONTAINER_NAME = "ocrprojections" 7 | -------------------------------------------------------------------------------- /genalog/ocr/grok.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | import time 7 | 8 | from .blob_client import GrokBlobClient 9 | from .rest_client import GrokRestClient 10 | 11 | 12 | class Grok: 13 | @staticmethod 14 | def create_from_env_var(): 15 | """Initializes Grok based on keys in the environment variables. 
16 | 17 | Returns: 18 | Grok: the Grok client 19 | """ 20 | grok_rest_client = GrokRestClient.create_from_env_var() 21 | grok_blob_client = GrokBlobClient.create_from_env_var() 22 | return Grok(grok_rest_client, grok_blob_client) 23 | 24 | def __init__( 25 | self, grok_rest_client: GrokRestClient, grok_blob_client: GrokBlobClient 26 | ): 27 | self.grok_rest_client = grok_rest_client 28 | self.grok_blob_client = grok_blob_client 29 | 30 | def run_grok( 31 | self, 32 | src_folder_path, 33 | dest_folder_path, 34 | blob_dest_folder=None, 35 | cleanup=False, 36 | use_async=True, 37 | ): 38 | """Uploads images in the source folder to blob, sets up an indexing pipeline to run 39 | GROK OCR on this blob storage as a source, then dowloads the OCR output json to the destination 40 | folder. There resulting json files are of the same name as the original images except prefixed 41 | with the name of their folder on the blob storages and suffixed with the .json extension. 42 | 43 | Args: 44 | src_folder_path (str): Path to folder holding the images. This folder must only contain png or jpg files 45 | dest_folder_path (str): Path to folder where OCR json files will be placed 46 | blob_dest_folder (str, optional): Folder tag to use on the blob storage. If set to None, a hash is generated 47 | based on the names of files in the src folder. Defaults to None. 48 | cleanup (bool, optional): If set to True, the indexing pipeline is deleted, and the files uploaded to the blob are 49 | deleted from blob after running. Defaults to True. 50 | use_multiprocessing (boo, optional): If set to True, this will use multiprocessing to increase blob transfers speed. 
51 | 52 | Returns: 53 | indexer_status json, blob folder name 54 | """ 55 | print("uploading images to blob") 56 | blob_folder_name, _ = self.grok_blob_client.upload_images_to_blob( 57 | src_folder_path, dest_folder_name=blob_dest_folder, use_async=use_async 58 | ) 59 | print(f"images upload under folder {blob_folder_name}") 60 | try: 61 | print("creating and running indexer") 62 | self.grok_rest_client.create_indexing_pipeline() 63 | time.sleep(2) 64 | 65 | indexer_status = self.grok_rest_client.get_indexer_status() 66 | if indexer_status["status"] == "error": 67 | raise RuntimeError(f"indexer error: {indexer_status}") 68 | 69 | # if not already running start the indexer 70 | print("indexer_status", indexer_status) 71 | if ( 72 | indexer_status["lastResult"] is None 73 | or indexer_status["lastResult"]["status"] != "inProgress" 74 | ): 75 | self.grok_rest_client.run_indexer() 76 | 77 | time.sleep(1) 78 | print("\nrunning indexer") 79 | indexer_status = self.grok_rest_client.poll_indexer_till_complete() 80 | if indexer_status["lastResult"]["status"] == "success": 81 | time.sleep(30) 82 | print("fetching ocr json results.") 83 | self.grok_blob_client.get_ocr_json( 84 | blob_folder_name, dest_folder_path, use_async=use_async 85 | ) 86 | print(f"indexer status {indexer_status}") 87 | print( 88 | f"finished running indexer. json files saved to {dest_folder_path}" 89 | ) 90 | else: 91 | print("GROK failed", indexer_status["status"]) 92 | raise RuntimeError("GROK failed", indexer_status["status"]) 93 | return indexer_status, blob_folder_name 94 | finally: 95 | if cleanup: 96 | print("cleaning up indexer pipeline and blob store") 97 | self.cleanup(blob_folder_name) 98 | 99 | def cleanup(self, folder_name): 100 | """Deletes the indexing pipeline (index, indexer, datasource, skillset) from the search service. 
101 | Deletes uploaded files from the blob 102 | 103 | Args: 104 | folder_name (str): blob folder name tag to remove 105 | """ 106 | self.grok_blob_client.delete_blobs_folder(folder_name) 107 | self.grok_rest_client.delete_indexer_pipeline() 108 | -------------------------------------------------------------------------------- /genalog/ocr/templates/datasource.json: -------------------------------------------------------------------------------- 1 | { 2 | "description" : "ocr image datasource", 3 | "credentials" : { "connectionString" : "" }, 4 | "container" : {"name": ""} 5 | } -------------------------------------------------------------------------------- /genalog/ocr/templates/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "[INDEX_NAME]", 3 | "fields": [ 4 | { 5 | "name": "content", 6 | "type": "Edm.String", 7 | "facetable": false, 8 | "filterable": false, 9 | "key": false, 10 | "retrievable": true, 11 | "searchable": true, 12 | "sortable": false, 13 | "analyzer": "standard.lucene", 14 | "indexAnalyzer": null, 15 | "searchAnalyzer": null, 16 | "synonymMaps": [], 17 | "fields": [] 18 | }, 19 | { 20 | "name": "metadata_storage_content_type", 21 | "type": "Edm.String", 22 | "facetable": false, 23 | "filterable": false, 24 | "key": false, 25 | "retrievable": false, 26 | "searchable": false, 27 | "sortable": false, 28 | "analyzer": null, 29 | "indexAnalyzer": null, 30 | "searchAnalyzer": null, 31 | "synonymMaps": [], 32 | "fields": [] 33 | }, 34 | { 35 | "name": "metadata_storage_size", 36 | "type": "Edm.Int64", 37 | "facetable": false, 38 | "filterable": false, 39 | "retrievable": false, 40 | "sortable": false, 41 | "analyzer": null, 42 | "indexAnalyzer": null, 43 | "searchAnalyzer": null, 44 | "synonymMaps": [], 45 | "fields": [] 46 | }, 47 | { 48 | "name": "metadata_storage_last_modified", 49 | "type": "Edm.DateTimeOffset", 50 | "facetable": false, 51 | "filterable": false, 52 | "retrievable": true, 
53 | "sortable": false, 54 | "analyzer": null, 55 | "indexAnalyzer": null, 56 | "searchAnalyzer": null, 57 | "synonymMaps": [], 58 | "fields": [] 59 | }, 60 | { 61 | "name": "metadata_storage_content_md5", 62 | "type": "Edm.String", 63 | "facetable": false, 64 | "filterable": false, 65 | "key": false, 66 | "retrievable": true, 67 | "searchable": false, 68 | "sortable": false, 69 | "analyzer": null, 70 | "indexAnalyzer": null, 71 | "searchAnalyzer": null, 72 | "synonymMaps": [], 73 | "fields": [] 74 | }, 75 | { 76 | "name": "metadata_storage_name", 77 | "type": "Edm.String", 78 | "facetable": false, 79 | "filterable": false, 80 | "key": false, 81 | "retrievable": true, 82 | "searchable": true, 83 | "sortable": true, 84 | "analyzer": null, 85 | "indexAnalyzer": null, 86 | "searchAnalyzer": null, 87 | "synonymMaps": [], 88 | "fields": [] 89 | }, 90 | { 91 | "name": "metadata_storage_path", 92 | "type": "Edm.String", 93 | "facetable": false, 94 | "filterable": false, 95 | "key": true, 96 | "retrievable": true, 97 | "searchable": false, 98 | "sortable": false, 99 | "analyzer": null, 100 | "indexAnalyzer": null, 101 | "searchAnalyzer": null, 102 | "synonymMaps": [], 103 | "fields": [] 104 | }, 105 | { 106 | "name": "metadata_content_type", 107 | "type": "Edm.String", 108 | "facetable": false, 109 | "filterable": false, 110 | "key": false, 111 | "retrievable": false, 112 | "searchable": false, 113 | "sortable": false, 114 | "analyzer": null, 115 | "indexAnalyzer": null, 116 | "searchAnalyzer": null, 117 | "synonymMaps": [], 118 | "fields": [] 119 | }, 120 | { 121 | "name": "merged_content", 122 | "type": "Edm.String", 123 | "facetable": false, 124 | "filterable": false, 125 | "key": false, 126 | "retrievable": true, 127 | "searchable": true, 128 | "sortable": false, 129 | "analyzer": "standard.lucene", 130 | "indexAnalyzer": null, 131 | "searchAnalyzer": null, 132 | "synonymMaps": [], 133 | "fields": [] 134 | }, 135 | { 136 | "name": "text", 137 | "type": 
"Collection(Edm.String)", 138 | "facetable": false, 139 | "filterable": false, 140 | "retrievable": true, 141 | "searchable": true, 142 | "analyzer": "standard.lucene", 143 | "indexAnalyzer": null, 144 | "searchAnalyzer": null, 145 | "synonymMaps": [], 146 | "fields": [] 147 | }, 148 | { 149 | "name": "layoutText", 150 | "type": "Collection(Edm.String)", 151 | "facetable": false, 152 | "filterable": false, 153 | "retrievable": true, 154 | "searchable": true, 155 | "analyzer": "standard.lucene", 156 | "indexAnalyzer": null, 157 | "searchAnalyzer": null, 158 | "synonymMaps": [], 159 | "fields": [] 160 | } 161 | ], 162 | "suggesters": [], 163 | "scoringProfiles": [], 164 | "defaultScoringProfile": "", 165 | "corsOptions": null, 166 | "analyzers": [], 167 | "charFilters": [], 168 | "tokenFilters": [], 169 | "tokenizers": [] 170 | } -------------------------------------------------------------------------------- /genalog/ocr/templates/indexer.json: -------------------------------------------------------------------------------- 1 | { 2 | "fieldMappings": [ 3 | { 4 | "sourceFieldName": "metadata_storage_path", 5 | "targetFieldName": "metadata_storage_path", 6 | "mappingFunction": { 7 | "name": "base64Encode" 8 | } 9 | } 10 | ], 11 | "outputFieldMappings": [ 12 | { 13 | "sourceFieldName": "/document/merged_content", 14 | "targetFieldName": "merged_content" 15 | }, 16 | { 17 | "sourceFieldName": "/document/normalized_images/*/text", 18 | "targetFieldName": "text" 19 | }, 20 | { 21 | "sourceFieldName": "/document/normalized_images/*/layoutText", 22 | "targetFieldName": "layoutText" 23 | } 24 | ], 25 | "parameters": { 26 | "maxFailedItems": -1, 27 | "configuration": { 28 | "dataToExtract": "contentAndMetadata", 29 | "imageAction": "generateNormalizedImages" 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /genalog/ocr/templates/knowledge_store.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "projections": [ 3 | { 4 | "tables": [ ], 5 | "objects": [ 6 | { 7 | "storageContainer": "projections", 8 | "source": null, 9 | "generatedKeyName": "myobject", 10 | "sourceContext": "/document", 11 | "inputs": [ 12 | { 13 | "name": "metadata_storage_name", 14 | "source": "/document/metadata_storage_name" 15 | }, 16 | { 17 | "name": "metadata_storage_path", 18 | "source": "/document/metadata_storage_path" 19 | }, 20 | { 21 | "name": "ocrText", 22 | "source": "/document/normalized_images/*/text" 23 | }, 24 | { 25 | "name": "ocrLayoutText", 26 | "source": "/document/normalized_images/*/layoutText" 27 | } 28 | ] 29 | 30 | } 31 | ], 32 | "files": [] 33 | } 34 | ] 35 | } -------------------------------------------------------------------------------- /genalog/ocr/templates/skillset.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "example_skillset", 3 | "description": "Skillset to run ocr on docs ;", 4 | "skills": [ 5 | { 6 | "@odata.type": "#Microsoft.Skills.Text.MergeSkill", 7 | "name": "#1", 8 | "context": "/document", 9 | "insertPreTag": " ", 10 | "insertPostTag": " ", 11 | "inputs": [ 12 | { 13 | "name": "text", 14 | "source": "/document/content" 15 | }, 16 | { 17 | "name": "itemsToInsert", 18 | "source": "/document/normalized_images/*/text" 19 | }, 20 | { 21 | "name": "offsets", 22 | "source": "/document/normalized_images/*/contentOffset" 23 | } 24 | ], 25 | "outputs": [ 26 | { 27 | "name": "mergedText", 28 | "targetName": "merged_content" 29 | } 30 | ] 31 | }, 32 | { 33 | "@odata.type": "#Microsoft.Skills.Vision.OcrSkill", 34 | "name": "#2", 35 | "context": "/document/normalized_images/*", 36 | "lineEnding": "Space", 37 | "defaultLanguageCode": "en", 38 | "detectOrientation": true, 39 | "inputs": [ 40 | { 41 | "name": "image", 42 | "source": "/document/normalized_images/*" 43 | } 44 | ], 45 | "outputs": [ 46 | { 47 | 
"name": "text", 48 | "targetName": "text" 49 | }, 50 | { 51 | "name": "layoutText", 52 | "targetName": "layoutText" 53 | } 54 | ] 55 | } 56 | ], 57 | "cognitiveServices": { 58 | "@odata.type": "#Microsoft.Azure.Search.CognitiveServicesByKey", 59 | "description": "cognitive service provider", 60 | "key": "" 61 | } 62 | } -------------------------------------------------------------------------------- /genalog/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/genalog/text/__init__.py -------------------------------------------------------------------------------- /genalog/text/lcs.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | class LCS: 7 | """ Compute the Longest Common Subsequence (LCS) of two given string.""" 8 | 9 | def __init__(self, str_m, str_n): 10 | self.str_m_len = len(str_m) 11 | self.str_n_len = len(str_n) 12 | dp_table = self._construct_dp_table(str_m, str_n) 13 | self._lcs_len = dp_table[self.str_m_len][self.str_n_len] 14 | self._lcs = self._find_lcs_str(str_m, str_n, dp_table) 15 | 16 | def _construct_dp_table(self, str_m, str_n): 17 | m = self.str_m_len 18 | n = self.str_n_len 19 | 20 | # Initialize DP table 21 | dp = [[0 for j in range(n + 1)] for i in range(m + 1)] 22 | 23 | for i in range(1, m + 1): 24 | for j in range(1, n + 1): 25 | # Case 1: if char1 == char2 26 | if str_m[i - 1] == str_n[j - 1]: 27 | dp[i][j] = 1 + dp[i - 1][j - 1] 28 | # Case 2: take the max of the values in the top and left cell 29 | else: 30 | dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]) 31 | return dp 32 | 33 | def _find_lcs_str(self, str_m, str_n, dp_table): 34 | 
m = self.str_m_len 35 | n = self.str_n_len 36 | lcs = "" 37 | while m > 0 and n > 0: 38 | # same char 39 | if str_m[m - 1] == str_n[n - 1]: 40 | # prepend the character 41 | lcs = str_m[m - 1] + lcs 42 | m -= 1 43 | n -= 1 44 | # top cell > left cell 45 | elif dp_table[m - 1][n] > dp_table[m][n - 1]: 46 | m -= 1 47 | else: 48 | n -= 1 49 | return lcs 50 | 51 | def get_len(self): 52 | return self._lcs_len 53 | 54 | def get_str(self): 55 | return self._lcs 56 | -------------------------------------------------------------------------------- /genalog/text/preprocess.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # --------------------------------------------------------- 5 | 6 | import re 7 | 8 | END_OF_TOKEN = {" ", "\t", "\n"} 9 | NON_ASCII_REPLACEMENT = "_" 10 | 11 | 12 | def remove_non_ascii(token, replacement=NON_ASCII_REPLACEMENT): 13 | """Remove non ascii characters in a token 14 | 15 | Arguments: 16 | token (str) : a word token 17 | replacement (str, optional) : a replace character for non-ASCII characters. 18 | Defaults to ``NON_ASCII_REPLACEMENT``. 
19 | Returns: 20 | str -- a word token with non-ASCII characters removed 21 | """ 22 | # Remove non-ASCII characters in the token 23 | ascii_token = str(token.encode("utf-8").decode("ascii", "ignore")) 24 | # If token becomes an empty string as a result 25 | if len(ascii_token) == 0 and len(token) != 0: 26 | ascii_token = replacement # replace with a default character 27 | return ascii_token 28 | 29 | 30 | def tokenize(s): 31 | """Tokenize string 32 | 33 | Arguments: 34 | s (str) : aligned string 35 | 36 | Returns: 37 | a list of tokens 38 | """ 39 | # split alignment tokens by spaces, tabs and newline (and excluding them in the tokens) 40 | return s.split() 41 | 42 | 43 | def join_tokens(tokens): 44 | """Join a list of tokens into a string 45 | 46 | Arguments: 47 | tokens (list) : a list of tokens 48 | 49 | Returns: 50 | a string with space-separated tokens 51 | """ 52 | return " ".join(tokens) 53 | 54 | 55 | def _is_spacing(c): 56 | """ Determine if the character is ignorable """ 57 | return True if c in END_OF_TOKEN else False 58 | 59 | 60 | def split_sentences(text, delimiter="\n"): 61 | """ Split a text into sentences with a delimiter""" 62 | return re.sub(r"(( /?[.!?])+ )", rf"\1{delimiter}", text) 63 | 64 | 65 | def is_sentence_separator(token): 66 | """ Returns true if the token is a sentence splitter """ 67 | return re.match(r"^/?[.!?]$", token) is not None 68 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | flake8 2 | flake8-import-order 3 | pytest 4 | pytest-cov 5 | pytest-mock 6 | pytest-xdist[psutil] 7 | pytest-lazy-fixture 8 | tox 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | biopython 2 | numpy 3 | python-dotenv 4 | requests 5 | azure-core 6 | azure-common 7 | 
azure-storage-blob 8 | tqdm 9 | Jinja2==2.11.1 10 | WeasyPrint 11 | matplotlib 12 | scikit-image 13 | pandas 14 | aiofiles 15 | aiohttp -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import setuptools 4 | 5 | with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'VERSION.txt')) as version_file: 6 | BUILD_VERSION = version_file.read().strip() 7 | 8 | # Loading dependencies from requirements.txt 9 | with open('requirements.txt') as f: 10 | requirements = f.read().splitlines() 11 | 12 | with open("README.md", "r", encoding="utf8") as fh: 13 | long_description = fh.read() 14 | 15 | setuptools.setup( 16 | name="genalog", 17 | install_requires=requirements, 18 | version=BUILD_VERSION, 19 | author="Jianjie Liu & Amit Gupte", 20 | author_email="ta_maidap_fy20_h2@microsoft.com", 21 | description="Tools for generating analog document (images) from raw text", 22 | long_description=long_description, 23 | long_description_content_type="text/markdown", 24 | url='https://github.com/microsoft/genalog', 25 | packages=setuptools.find_packages(exclude=['tests', 'tests.*']), 26 | package_data={'': [ 27 | 'genalog/generation/templates/*.jinja' 28 | ]}, 29 | include_package_data=True, 30 | classifiers=[ 31 | "Programming Language :: Python :: 3", 32 | "Operating System :: OS Independent", 33 | ], 34 | python_requires='>=3.6', 35 | ) 36 | -------------------------------------------------------------------------------- /tests/.env: -------------------------------------------------------------------------------- 1 | COMPUTER_VISION_ENDPOINT = "https://enki-vision.cognitiveservices.azure.com/" 2 | SEARCH_SERVICE_NAME = "ocr-ner-pipeline" 3 | SKILLSET_NAME = "testocrskillset" 4 | INDEX_NAME = "testocrindex" 5 | INDEXER_NAME = "testocrindexer" 6 | DATASOURCE_NAME = "syntheticimages" 7 | DATASOURCE_CONTAINER_NAME = "testocrimages" 8 | 
BLOB_NAME = "syntheticimages" 9 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import pytest 5 | from dotenv import load_dotenv 6 | 7 | from tests.required_env import RequiredEnvVar 8 | 9 | ENV_FILEPATH = "tests/.env" 10 | 11 | 12 | @pytest.fixture(scope="session") 13 | def load_azure_resources(): 14 | # Loading the non-secrets 15 | load_dotenv(ENV_FILEPATH) 16 | logging.info(f"Loading .env from {ENV_FILEPATH}") 17 | logging.debug("Printing environment vars: ") 18 | for env in RequiredEnvVar: 19 | logging.debug(f"\t{env.value}: {os.environ.get(env.value)}") 20 | -------------------------------------------------------------------------------- /tests/e2e/data/conll_formatter/clean_labels/2161.txt: -------------------------------------------------------------------------------- 1 | who O 2 | would O 3 | be O 4 | elevated O 5 | to O 6 | Heaven O 7 | and O 8 | not O 9 | be O 10 | burned O 11 | in O 12 | etermal O 13 | damnation O 14 | , O 15 | only O 16 | slants O 17 | the O 18 | facts O 19 | : O 20 | & O 21 | quot O 22 | ; O 23 | . O 24 | -------------------------------------------------------------------------------- /tests/e2e/data/conll_formatter/ocr_text/17.txt: -------------------------------------------------------------------------------- 1 | So , al - jazeera TV station seized this opportunity to get hold of , this encies group of people by faring them , or , at their reporters , editors , or anchors at high salaries , Uhuh . So, they had a relatively good team of reporters . Un fun . 
Well , this way , later it followed that , or ! or. . , er , six - Chanese character prix loss of news reporting cased , er , independence , neutraity , an , neveraity , basece and freedom Un fun . Uh-huh. So , what it reported was in a completely different style from that of some other Arab TV stations. Right . in thas respect , that is , bet me add one powx , that is , this al- Jazeera TV station, ah , it's style is very much characterized by direct borrowing from the west , for in stance , the two mainstream media outlets un-ten . Unsoon. Right . Un toh One is CAN, and the other is BSC . Yeah , Well , I think that BBC , in particular . hes grate s bag influence on it . Just now that is to say , many of its reporters directly came from the societe fast charmed of that time . Un fam . That was jointty pin by Bec and Saudi Arsexia ! So I had s very good foundation! in adition , that is . actually . this aljazeera TV station has a quite unique structure. That's because the one of this country is caked Hamad : He studied in Britain and therefore had quite a good knowsedge of Bream's BBC TV station , He also ques adriwed . .So, in this way, it has borrowed some of the BBC style . For instance, Britain's BNC is a very old TV group established in 1927 . Un hon: Though it is funded by the government with many of is properties owned by the government , at the fodoes the guideline of ta ter's independence . Un-hun For iatance, the top deckion . making body of BBC is called the board of director which are composed of twelve members sweetly appointed by the Queen . Unnun, So at Jazeera TV station has onto adopted that structure . It has a top seven . member board of directors . Un huh . Un tech . However , even! it's property, an , and funds can come from the government . R gis relatively has its . Mortal independence. En, but in essence, be Wy, is it ready a private TV station or government . nun TV station ? 
Er , they themselves claim that & is a private TV station , but in revery I could not have been established weneed a large amount of financial support given by the govern mart. That's because it, eh, has an extremely small number of ads during as around the clock, an. TVbroadcast Un hun Un hun ! well, in audition , at has such a large team , especially with high , high er, wages , and reporters based abroad, so many reporters abroad. If the government had not sug ported wich exper Stures , on, It would be impossible for a private TV station to survive , According to your knowledge , how much is its yearty expenditure ? it was said that the yearly expenditures seem to be about 7 bastion US dollars. About 7 bibion US dollars, that is equivalent to more than 50 bastion Road We showed say this is a very huge financial expenditure, Un fun. Extremely large. wan. Therefore, some people call it gaining woke but kising money because it has relatively imas revenues due to few ads Ut hun Unfun Union , Un tan. But it has a tremendous influence. Right. So. It appears that * is sod not bad in gaining voice as it does achieve some effect, wan. He was, speaking of its andtu ence , we have ano nocked that or, during the Alphon was, er, because it has the exclusive interview right to enter Mohantion to conduct independent and cocksive interviews, we could say this is one of is advantages, Uhiuhi, & ano presented an opportunity for as sex. wet, to the development of a TV station , both opportunity and real strength are actually very important ) well , as al Jazeera TV sta tion has been was to develop see as current status. what do you think has & robed upon so that its competitiveness and invential power , even surpassed CAN and BAC during the iraq war ? The great west feature of al jazeera TV station is that it is a small station that competes with large ones and has become wed . known through wars . 
That is, the reason why it can establish itself is that it first resed on the Action war, and then the way war after the 9/ 11, Unfun Chifue So, It made fod use of these two opportunities . in addition, as the iraq war occurred right in an Arab country , everyone is very much concerned with what is going on in the war . So , thus provides it with a large viewership. making it instantly famous . Un hurt , Un fish . Besides , i has a lot of resources , including it's exckative coverage right in Afghanistan as you mentioned wist now . Un ton , un tan . in addition , al Jazeera TV station is actualy quite ious in the Arab rection, For instance , a very tough commut ie this region is the conflict between Palestine and barzel unfun Utton. Well, in gerard, Arabs wie nick wake the trash leader in make a speech on their TV. However, for al jazeera TV station , It could inyee Barak the israel prime minister at that time , to debver a speech at al Jazeera TV station , This was un precedented in the Arab world . Right . Un huh . Well , this also gave it a very unique perspective . Un -------------------------------------------------------------------------------- /tests/e2e/data/conll_formatter/ocr_text/1839.txt: -------------------------------------------------------------------------------- 1 | would n't k be more & quot ; deflational & quot ; to excise bebefs where we can ? Box manyways were and still are . convinced that the memory lobotomy was intentional . part of washington 's plans to ex cige the strong . 
rooted nation that was andreplace it with their own model , And you can also exche the topnerseparatce wow needed to return the pendant to which explains it be that & quot ; Rumor is that a personal kem of a priest 's may be used to drew & out or excise a from is hours & quot ; and the wants to use the pendant to & quot : remove a particularly burdensome spire from a property I'm loking to kwest in & quot : Would n't it be more deflations to excise beeats where we can ? You're probably right , you ca n't excise an entire category LAB unless you 've budgeted for entertainment and vacations , in which case that should be the first to go to zero ARE , but I suspect you need to book twit at the areas of large expense wich for most people are housing and cars. My famay practice doctor was going to excise it and although I have a very high level of confidence in him . I posted ask. big # I should be referred to a surgeon . But we loise something in the world when are have to excise at imagery of chadren , they innocence and joy from our world, In order to protect them? The doctor ex cises the biopsy and does n't stitch up the wound site to preserve healthy tiswe for a week or more be fore he gives me the results of has exam. That would be more evident had you not excised the crap to which I was responding , but of course you had to leave that out in order to come up on your high horse and found superior. wore : You're probably right, you ca n't excise an entire category LAD unless you've budgeted for entertainment and vacations , In which case that should be the first to go to zero. And. , beat I suspect you need to look first at the areas of large expense which for most people are housing and cars ? " ARB. war quote , which your neatly excised from this post, could be interpreted several offerent ways : and humorous was envy one of them . 
Diebold and the dubious voting machines voting machine company Diebold apparently excised long paragraphs detawny the US security indus. try's concerns over the integrity of thes voting machines, and information about the company 's chief executive 's neweasing for President bush, It extinguishes the small; it inflames the great. This made me into s meany , but at aise extinguished the whining. He said that we had to extinguish the bigies of the world , and when we would see the lights of New wat go out, we would know the our job was done & quot ; too have done nothing that extinguishes others " homes to use the land . just lee the he wrists , they are trying to extinguish the flames of the jewish sox & quot ; Father , extinguishes the thast of our poverty . Then I took tom from & and extinguished it with my hands , which made the evening news . The reason for this is that the owner of the house is treated as paying off part of the Gett wah the /MY of the house . however, because the debt is a personal lately. the transfer of the house don't not extinguish the remainder of the date, which continues is be a personal Sabaity of the owner ! You're paying attention to the new behavior and letting the bid one go, so the old one extin guithes . If you did n't extinguish the fame, you 'd have good kick for a fod year ! The present Tet madaw Government extinguished the conducts of fire the could not be extinguished at the time of pre visit Governeverts If he 'd 've done ., he I've found 's way to extinguish es before . prestened to turn them as crispy . The Vista fire Department extinguished the bare before i grew out of control the statement said. Thus little by better, It extinguishes their sports and enervates their souls .. 
R Is understood that the rain drop reserves of the tree must wolter in capecky to the plan of government * has to extinguish, They tell your that if you accidentally start a fire and you have n't managed to extin guish it in the first name , you're probably not going to be able to and you should get help ined ately . This mual harkens back to the days when Somann was one of only two days the other being Betaire when a was considered correct to extinguish the & quot ; hearth fire & quot ; and then to re light it, The two other meds are sbout to teach thes investigations, and according to our sources , the White House exerts pressures so that they extinguish the business . You have to be careful what you extinguish , It only takes a few minutes to appopriately extinguish any behavior , but you 've got to know HOWE : The suspect's son, a fee forover, was ised enter this year while trying to extinguish s factory bare , according to police. I know k is to extinguish a five but how old could they be and what we they worth ? ? ? 17 )? when the new bankruptcy be was passed by Congress last spring, bankers predicted & would turn many people away from the protection of the courts by making & harder to ex ianguish dete , A flashlight shone in one comer of a dark room does not extinguish the rest of the room we just ca n't SEE R . -------------------------------------------------------------------------------- /tests/e2e/data/conll_formatter/ocr_text/2161.txt: -------------------------------------------------------------------------------- 1 | who would be elevated to Heaven and not be burned in etermal damnation , only slants the facts : & quot ; . -------------------------------------------------------------------------------- /tests/e2e/data/conll_formatter/ocr_text/5.txt: -------------------------------------------------------------------------------- 1 | well, this is because he is all using the pre Cold . 
was automatic thinking after the Cold War, that is the United States is the number one superpower in the world . Un ten : Un hat, Japan is the second largest economy in the world . Un hun As long as these two countries stand together , then it seems that other countries could not do anything to them . Un hun . well, this is actually a very overbearing image Un hit . Well, looking at the situation in Ava, actually , I'is exactly the other way round . Un. hun. That is to say , I it does not get on good terms with is Asian neighboring countries. then in rest. ly the US mis become even more hard . mine in its diplomacy with japan. As for japan, to the US. . As thermore has lost some assets , diplomneck assets when making negotiations with the US. Un fun. Un hun. Well, just recently, it's nee diets for us to say, or, that Japan's relations , we can say , wich it's neighboring countries that your have suffered overat deterioration . These inchoate Japan 's relations with South Korea, Chine, and Russia which are all deteriorating, was, even including it's relations with the US. in fact,. even today , we sho noticed a piece of news that athough at, uns, the recers two ! pills . two meeting, or , the US and japan again reached an intermediate report, Un tun, Un fun . UP hun. Well . today , actualy the head of japan's Defense Agency again mentioned to the US that seems to feel regretted, why ? Uh-huh, That is , a be regreens, Because, according to the two - pais . two intermediate report jist reached. the US matary bases in japan should undergo adjustment . Now ever . I was met with strong opposition from the general pubic in Japan . So it may be hard to inple ment . Union So, now the DA head came to talk with the US, saying, whether our interests can be taken wito comideration again . or. in this inverme face report . That is, between the US and Japan. centering upon this intermediate report, actually the deal is again partially completed . Uh ton. So . 
such a prime minister is forum , who sandy follows the US as the passions of his diplomacy . was rarely seen even during the Coal wes ersUh-huh, We know that in 1957 Japan had three diplomatic principles . That is to say , It ment take good care of as relations with the western countries, Asian countries , and the United Nations : Uni out, Utsaun, But now! Keizer is lee a stick , let 's say . a post. Right it takes three points to support s plane . Wan . So it turned out that they had such son on the surface : Well ; some prime ministers before Kotzuns of beast would stil property deal with and butsince the relations with America , with the US and Asia , In particular , with them . an Un hun . Yes when a comes to Kozuers , he only takes good care of the relations with the US, Right . in fact, he wisely fail to take good care of the relations wah the US because if you can not win the trust of your Asin neighboring countries, the US we not by respect such a country . Un tan ! Un has . The US also wants to go beyond japan to keep good relations wan Chains , South korea , and other Asian coun. tries . Therefore , the more he follows the US, the lover will be ise actual status in Ass, Unfan. Ris apossible . Right , his kies was actually erakired and opposed by some postxian's even in japan. He was strongly criticized and increasingly isolated . Ah . The domestic economy was also affected . Many people afto mentioned , by young the US alone, you are inoring Atla . was. Now many meds of is sued statements and accorians. Therefore, some people also compared this diplomacy of has to a lame diplomacy . Yeah . He - Could he wake stewkey with a lame diplomacy ? Eh, as far as kolrumi himself is concerned , what is his probeem ? Uh ten , He himself, oh, or what showed we say ? He does not re spect and acknowledge reality . Actually , some people say he is deceiving himself as well as others Un hat. for instance , that time at the APEC meeting in Puton. 
when he mentioned, or , japan 's reis When's with Chains, he win used this kind of, that is to say, thecork, to offend manned, Un hen Union He said, or an , right now, Japan. China is ations are not lee, or, what the internacional comes nety is worried about Un ton. well the ecoreank trade development in our two countries is going on gate well, in fact , we can say he does not understand the current status of Sino . Japanese relations Un hut. Actually, due to his visit to resusun Shine and the cooling down of the political relationship Sing . japanese relations have suffered severe blows , He is only indulging in his unilateral winds thinking . I is only what he thinks. thi han , we can see some concrete figures , for instance, from larwary to August its's year, the latest statistics show that japan's experts to Chine grew only by 3.2 % according to China's statistics . Union Uni butt . It grew only by 5.8 % according to japan's statistics If tum . Well , in prevex's years . It was always more than 25 % . in other words , Japan's exports to China were declining sharply . More than 25 % . Un hun . -------------------------------------------------------------------------------- /tests/e2e/data/conll_formatter/ocr_text/7965.txt: -------------------------------------------------------------------------------- 1 | A it the ultimate room with a view for formula one fans , racing afficionados all next week be able to spend the night and the race day in the heart of the action at the malaysian grand prix on the sepang frack . between march 27 and 29 , visitors wit site have a unique midnight tour of the track , a trade tional matrian breakfast in the morning with former git reporter sanjeev palar , and an exclusive be And the scenes pit tour. enjoy the world 's hottest race , /1 , from the comforts of your safe , with a unique stay in the heart of the action inside sepang track in malaysia . 
the apartment boasts unrivaled views of the racing track and will allow fans to witness their favourite start up close , the apartment sleeps four and provides a unique luxury experience with panoramic wows . guests can enjoy the open plan apartment for all those sessions of the race including practice . qualifying and race day Itself . the home Is described as an " basis at the heart of the world 's hottest race ' and comes complete with a sige living room featuring panorama windows offering uninteripted views of the racetrack , one mas let bedroom ,'s kitchen , a divine area and a bathroom : fans can witness every second of the world 's fastest engines from the comfort of the living room couch . they can also bring along those friends of their own to share in the exciting event ! from monaco to melbourne , the world 's fastest and most prestigious race has traversed five continents, renowned for it's mooring engines and sky high tempers ture . depending on the dates assigned . quests will be treated to a race pa tour , a midnight track tour and an sunday breakfast with an auto host , malaysia 's pit lane reporter , sanjeev palar . in or det to secure your place in this once in a sictime racing experience , enter at airbnb before march 22 the sepang track stay forms part of awoes 's 's night at ' campaign, which aims to convert unique to cations around the world , where no one has ever been able to send the night before , into unforget table wight says . this has included a night at the top of use holmenkollen ski jump in norway . inside the open plan apartment , there is a large living room . with panorama views of the racetrack , one master bedroom , a small kitchen, # dining area and a bathroom . 
quests will get to watch all three sessions of the race - practice , qualifying and race day fock -------------------------------------------------------------------------------- /tests/e2e/data/splitter/example_splits/clean_labels/1.txt: -------------------------------------------------------------------------------- 1 | On O 2 | July B-DATE 3 | 22 I-DATE 4 | , I-DATE 5 | 1940 I-DATE 6 | , O 7 | a O 8 | campaign O 9 | preparation O 10 | order O 11 | to O 12 | attack O 13 | the B-FAC 14 | Zhengtai I-FAC 15 | Railway I-FAC 16 | , O 17 | jointly O 18 | signed O 19 | by O 20 | Zhu B-PERSONNAME 21 | De I-PERSONNAME 22 | , O 23 | Peng B-PERSONNAME 24 | Dehuai I-PERSONNAME 25 | , O 26 | and O 27 | Zuo B-PERSONNAME 28 | Quan I-PERSONNAME 29 | , O 30 | was O 31 | sent O 32 | to O 33 | Yan'an B-GPE 34 | and O 35 | all O 36 | units O 37 | of O 38 | the B-ORGANIZATION 39 | Eighth I-ORGANIZATION 40 | Route I-ORGANIZATION 41 | Army I-ORGANIZATION 42 | . O 43 | 44 | What O 45 | was O 46 | the O 47 | , O 48 | purpose O 49 | and O 50 | goal O 51 | of O 52 | this O 53 | campaign O 54 | ? O 55 | ? O 56 | ? O 57 | ? O 58 | 59 | It O 60 | was O 61 | to O 62 | break O 63 | through O 64 | the O 65 | Japanese B-NORP 66 | army O 67 | 's O 68 | siege O 69 | policy O 70 | against O 71 | base O 72 | areas O 73 | behind O 74 | enemy O 75 | lines O 76 | , O 77 | and O 78 | to O 79 | avert O 80 | the O 81 | crisis O 82 | of O 83 | China B-GPE 84 | 's O 85 | compromise O 86 | and O 87 | surrender O 88 | . O 89 | 90 | It O 91 | was O 92 | to O 93 | overcome O 94 | this O 95 | crisis O 96 | . O 97 | 98 | Well O 99 | , O 100 | the B-EVENT 101 | Hundred I-EVENT 102 | Regiments I-EVENT 103 | Offensive I-EVENT 104 | was O 105 | divided O 106 | into O 107 | three B-CARDINAL 108 | phases O 109 | . 
O 110 | 111 | Beginning O 112 | from O 113 | August B-DATE 114 | 20 I-DATE 115 | , O 116 | from O 117 | August B-DATE 118 | 20 I-DATE 119 | to I-DATE 120 | September I-DATE 121 | 10 I-DATE 122 | , O 123 | the O 124 | main O 125 | purpose O 126 | of O 127 | the O 128 | ... O 129 | . O 130 | 131 | -------------------------------------------------------------------------------- /tests/e2e/data/splitter/example_splits/clean_text/0.txt: -------------------------------------------------------------------------------- 1 | What kind of memory ? 2 | We respectfully invite you to watch a special edition of Across China ! ! ! 3 | WW II Landmarks on the Great Earth of China : Eternal Memories of Taihang Mountain Standing tall on Taihang Mountain is the Monument to the Hundred Regiments Offensive . 4 | It is composed of a primary stele , secondary steles , a huge round sculpture and beacon tower , and the Great Wall , among other things . 5 | A primary stele , three secondary steles , and two inscribed steles . 6 | The Hundred Regiments Offensive was the campaign of the largest scale launched by the Eighth Route Army during the War of Resistance against Japan . 7 | This campaign broke through the Japanese army 's blockade to reach base areas behind enemy lines , stirring up anti-Japanese spirit throughout the nation and influencing the situation of the anti-fascist war of the people worldwide . 8 | This is Zhuanbi Village , Wuxiang County of Shanxi Province , where the Eighth Route Army was headquartered back then . 9 | On a wall outside the headquarters we found a map . 10 | This map was the Eighth Route Army 's depiction of the Mediterranean Sea situation at that time . 11 | This map reflected the European battlefield situation . 12 | In 1940 , the German army invaded and occupied Czechoslovakia , Poland , the Netherlands , Belgium , and France . 
13 | It was during this year that the Japanese army developed a strategy to rapidly force the Chinese people into submission by the end of 1940 . 14 | In May , the Japanese army launched -- From one side , it seized an important city in China called Yichang . 15 | Um , , uh , through Yichang , it could directly reach Chongqing . 16 | Ah , that threatened Chongqing . 17 | Then they would , ah , bomb these large rear areas such as Chongqing . 18 | So , along with the coordinated , er , economic blockade , military offensives , and strategic bombings , er , a simultaneous attack was launched in Hong Kong to lure the KMT government into surrender . 19 | The progress of this coordinated offensive was already very entrenched by then . 20 | By 1940 , China 's War of Resistance against Japan had entered a stalemate . 21 | The situation on our side and the enemy 's side was intertwined . 22 | The Eighth Route Army guerrillas were extraordinarily active , creating more and more trouble for the Japanese army in North China . 23 | Hayao Tada , commander of the Japanese North China Area Army , adopted a strategy of siege warfare to deal with the Eighth Route Army . 24 | The specific method was building a closely connected transport network , with a road for every village and defensive towers on every road . 25 | Roads and railways were used as links to connect all of North China into a solid , widespread siege , in order to strangle the Eighth Route Army and its base areas in this net . 26 | As part of the Japanese army 's strategy of siege warfare , railways and roads had actually become the Japanese army 's weapons of war , becoming a great threat to the base areas . 27 | In December 1939 , Commander - in - chief Zhu De and Vice Commander Peng Dehuai of the Eighth Route Army received a top - secret telegram from Commander Lu Zhengcao of the Jizhong Military District , among other people . 
28 | The telegram said that the Japanese troops were building blockade trenches and chessboard - like roads to divide the Jizhong base area into small isolated blocks without the ability to mutually communicate and support each other , causing the Eighth Route Army and the guerrillas to lose maneuverability . 29 | Before the Hundred Regiments Offensive in 1940 , an inclination to compromise , ah , surrender , was an extremely serious crisis in the frontline situation in China . 30 | Well , on the battlefield behind enemy lines , in order to take over , consolidate the area under its occupation , Japan began a new strategy . 31 | That was to use railways as a pillar , roads as a chain , and strongholds as a lock , to carry out siege warfare in an attempt to divide the base areas behind enemy lines , ah , so as , er , to cut off their communication with one another . 32 | In addition , it relied on this cage , ah , to further strengthen its assaults against the base areas . 33 | Er . 34 | So , it was amidst such a grave international and domestic situation that the Eighth Route Army led by the Chinese Communist Party , ah , launched , ah , a strategic offensive called the Hundred Regiments Offensive . 35 | This plot of the Japanese army drew great attention from Zhu De and Peng Dehuai of Eighth Route Army headquarters . 36 | After meticulous studies and painstaking preparations by many parties , a battle plan based on surprise was formulated . 37 | -------------------------------------------------------------------------------- /tests/e2e/data/splitter/example_splits/clean_text/1.txt: -------------------------------------------------------------------------------- 1 | On July 22 , 1940 , a campaign preparation order to attack the Zhengtai Railway , jointly signed by Zhu De , Peng Dehuai , and Zuo Quan , was sent to Yan'an and all units of the Eighth Route Army . 2 | What was the , purpose and goal of this campaign ? ? ? ? 
3 | It was to break through the Japanese army 's siege policy against base areas behind enemy lines , and to avert the crisis of China 's compromise and surrender . 4 | It was to overcome this crisis . 5 | Well , the Hundred Regiments Offensive was divided into three phases . 6 | Beginning from August 20 , from August 20 to September 10 , the main purpose of the ... . 7 | -------------------------------------------------------------------------------- /tests/e2e/data/synthetic_dataset/shared/train/clean_labels/2161.txt: -------------------------------------------------------------------------------- 1 | who O 2 | would O 3 | be O 4 | elevated O 5 | to O 6 | Heaven O 7 | and O 8 | not O 9 | be O 10 | burned O 11 | in O 12 | etermal O 13 | damnation O 14 | , O 15 | only O 16 | slants O 17 | the O 18 | facts O 19 | : O 20 | & O 21 | quot O 22 | ; O 23 | . O 24 | -------------------------------------------------------------------------------- /tests/e2e/data/synthetic_dataset/test_version/.gitignore: -------------------------------------------------------------------------------- 1 | **/ocr_labels 2 | **/ocr_text -------------------------------------------------------------------------------- /tests/e2e/data/synthetic_dataset/test_version/train/ocr/2161.json: -------------------------------------------------------------------------------- 1 | [{"language": "en", "text": "who would be elevated to Heaven and not be burned in etermal damnation , only slants the facts : & quot ; .", "lines": [{"boundingBox": [{"x": 146, "y": 157}, {"x": 1252, "y": 156}, {"x": 1253, "y": 179}, {"x": 147, "y": 180}], "text": "who would be elevated to Heaven and not be burned in etermal damnation , only slants the facts : &"}, {"boundingBox": [{"x": 147, "y": 184}, {"x": 228, "y": 183}, {"x": 229, "y": 203}, {"x": 148, "y": 204}], "text": "quot ; ."}], "words": [{"boundingBox": [{"x": 147, "y": 158}, {"x": 192, "y": 158}, {"x": 192, "y": 179}, {"x": 147, "y": 179}], "text": "who"}, 
{"boundingBox": [{"x": 199, "y": 158}, {"x": 263, "y": 158}, {"x": 264, "y": 179}, {"x": 199, "y": 179}], "text": "would"}, {"boundingBox": [{"x": 271, "y": 158}, {"x": 299, "y": 157}, {"x": 299, "y": 180}, {"x": 271, "y": 179}], "text": "be"}, {"boundingBox": [{"x": 307, "y": 157}, {"x": 400, "y": 157}, {"x": 400, "y": 180}, {"x": 308, "y": 180}], "text": "elevated"}, {"boundingBox": [{"x": 407, "y": 157}, {"x": 429, "y": 157}, {"x": 430, "y": 180}, {"x": 407, "y": 180}], "text": "to"}, {"boundingBox": [{"x": 436, "y": 157}, {"x": 518, "y": 157}, {"x": 518, "y": 180}, {"x": 437, "y": 180}], "text": "Heaven"}, {"boundingBox": [{"x": 528, "y": 157}, {"x": 567, "y": 157}, {"x": 567, "y": 180}, {"x": 528, "y": 180}], "text": "and"}, {"boundingBox": [{"x": 574, "y": 157}, {"x": 613, "y": 157}, {"x": 614, "y": 180}, {"x": 574, "y": 180}], "text": "not"}, {"boundingBox": [{"x": 618, "y": 157}, {"x": 646, "y": 157}, {"x": 646, "y": 180}, {"x": 618, "y": 180}], "text": "be"}, {"boundingBox": [{"x": 653, "y": 157}, {"x": 730, "y": 157}, {"x": 730, "y": 180}, {"x": 653, "y": 180}], "text": "burned"}, {"boundingBox": [{"x": 736, "y": 157}, {"x": 757, "y": 157}, {"x": 757, "y": 180}, {"x": 736, "y": 180}], "text": "in"}, {"boundingBox": [{"x": 765, "y": 157}, {"x": 854, "y": 157}, {"x": 854, "y": 180}, {"x": 765, "y": 180}], "text": "etermal"}, {"boundingBox": [{"x": 858, "y": 157}, {"x": 970, "y": 157}, {"x": 970, "y": 180}, {"x": 858, "y": 180}], "text": "damnation"}, {"boundingBox": [{"x": 979, "y": 157}, {"x": 990, "y": 157}, {"x": 990, "y": 180}, {"x": 979, "y": 180}], "text": ","}, {"boundingBox": [{"x": 994, "y": 157}, {"x": 1041, "y": 157}, {"x": 1041, "y": 180}, {"x": 994, "y": 180}], "text": "only"}, {"boundingBox": [{"x": 1046, "y": 157}, {"x": 1114, "y": 157}, {"x": 1114, "y": 180}, {"x": 1046, "y": 180}], "text": "slants"}, {"boundingBox": [{"x": 1118, "y": 157}, {"x": 1154, "y": 157}, {"x": 1154, "y": 180}, {"x": 1118, "y": 180}], "text": "the"}, {"boundingBox": 
[{"x": 1161, "y": 157}, {"x": 1216, "y": 157}, {"x": 1216, "y": 179}, {"x": 1161, "y": 180}], "text": "facts"}, {"boundingBox": [{"x": 1220, "y": 157}, {"x": 1233, "y": 157}, {"x": 1233, "y": 179}, {"x": 1220, "y": 179}], "text": ":"}, {"boundingBox": [{"x": 1237, "y": 157}, {"x": 1253, "y": 157}, {"x": 1253, "y": 179}, {"x": 1237, "y": 179}], "text": "&"}, {"boundingBox": [{"x": 148, "y": 185}, {"x": 198, "y": 184}, {"x": 198, "y": 204}, {"x": 149, "y": 205}], "text": "quot"}, {"boundingBox": [{"x": 202, "y": 184}, {"x": 213, "y": 184}, {"x": 213, "y": 204}, {"x": 202, "y": 204}], "text": ";"}, {"boundingBox": [{"x": 217, "y": 184}, {"x": 229, "y": 184}, {"x": 228, "y": 204}, {"x": 216, "y": 204}], "text": "."}]}] -------------------------------------------------------------------------------- /tests/e2e/templates/solid_bg.html.jinja: -------------------------------------------------------------------------------- 1 | 8 | {% if content and content.content_type.__str__() == "ContentType.COMPOSITE" %} 9 | {% for c in content %} 10 | {% if c.content_type.__str__() == "ContentType.PARAGRAPH"%} 11 |

12 | {{ c }} 13 |

14 | {% else %} 15 |

Unsupported Content Type: {{c.content_type.__str__()}}

import difflib
import glob
import warnings

import pytest

from genalog.text import alignment, anchor, preprocess


def _read_text(path):
    """Read a whole text file with an explicit encoding, closing the handle.

    The original tests used ``open(path, "r").read()`` which leaks the file
    handle (ResourceWarning under pytest) and relies on the platform-default
    encoding.
    """
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


@pytest.mark.slow
@pytest.mark.parametrize(
    "gt_file, ocr_file",
    zip(
        sorted(glob.glob("tests/unit/text/data/gt_*.txt")),
        sorted(glob.glob("tests/unit/text/data/ocr_*.txt")),
    ),
)
def test_align_w_anchor_and_align(gt_file, ocr_file):
    """Compare anchor-based alignment against plain alignment on file pairs.

    A mismatch between the two strategies is surfaced as a warning rather
    than a failure, since small deltas between them are acceptable.
    """
    gt_text = _read_text(gt_file)
    ocr_text = _read_text(ocr_file)
    aligned_anchor_gt, aligned_anchor_noise = anchor.align_w_anchor(gt_text, ocr_text)
    aligned_gt, aligned_noise = alignment.align(gt_text, ocr_text)

    if aligned_gt != aligned_anchor_gt:
        # Diff sentence-by-sentence (split on ".") to keep the warning readable.
        str_diff = "\n".join(
            difflib.unified_diff(aligned_gt.split("."), aligned_anchor_gt.split("."))
        )
        warnings.warn(
            UserWarning(
                "\n"
                + f"{str_diff}"
                + "\n\n**** Inconsistent Alignment Results between align() and "
                + "align_w_anchor(). Ignore this if the delta is not significant. ****\n"
            )
        )


@pytest.mark.slow
@pytest.mark.parametrize(
    "gt_file, ocr_file",
    zip(
        sorted(glob.glob("tests/unit/text/data/gt_*.txt")),
        sorted(glob.glob("tests/unit/text/data/ocr_*.txt")),
    ),
)
@pytest.mark.parametrize("max_seg_length", [25, 50, 75, 100, 150])
def test_find_anchor_recur_e2e(gt_file, ocr_file, max_seg_length):
    """Every anchor pair returned must reference identical tokens in both texts."""
    gt_tokens = preprocess.tokenize(_read_text(gt_file))
    ocr_tokens = preprocess.tokenize(_read_text(ocr_file))
    gt_anchors, ocr_anchors = anchor.find_anchor_recur(
        gt_tokens, ocr_tokens, max_seg_length=max_seg_length
    )
    for gt_anchor, ocr_anchor in zip(gt_anchors, ocr_anchors):
        # Ensure that each anchor word is the same word in both text
        assert gt_tokens[gt_anchor] == ocr_tokens[ocr_anchor]
sorted(glob.glob("tests/e2e/data/conll_formatter/ocr_text/*.txt")), 37 | ), 38 | ) 39 | def test_propagate_labels_sentence_single_file(clean_label_filename, ocr_text_filename): 40 | with open(clean_label_filename, "r", encoding="utf-8") as clf: 41 | tokens_labels_str = clf.readlines() 42 | clean_tokens = [ 43 | line.split()[0].strip() for line in tokens_labels_str if len(line.split()) == 2 44 | ] 45 | clean_labels = [ 46 | line.split()[1].strip() for line in tokens_labels_str if len(line.split()) == 2 47 | ] 48 | clean_sentences = conll_format.get_sentences_from_iob_format(tokens_labels_str) 49 | # read ocr tokens 50 | with open(ocr_text_filename, "r", encoding="utf-8") as otf: 51 | ocr_text_str = " ".join(otf.readlines()) 52 | ocr_tokens = [ 53 | token.strip() for token in ocr_text_str.split() 54 | ] # already tokenized in data 55 | 56 | ocr_text_sentences, ocr_labels_sentences = conll_format.propagate_labels_sentences( 57 | clean_tokens, clean_labels, clean_sentences, ocr_tokens 58 | ) 59 | ocr_sentences_flatten = list(itertools.chain(*ocr_text_sentences)) 60 | assert len(ocr_text_sentences) == len(clean_sentences) 61 | assert len(ocr_text_sentences) == len(ocr_labels_sentences) 62 | assert len(ocr_sentences_flatten) == len( 63 | ocr_tokens 64 | ) # ensure aligned ocr tokens == ocr tokens 65 | -------------------------------------------------------------------------------- /tests/e2e/test_generaton_n_degradation.py: -------------------------------------------------------------------------------- 1 | from genalog.degradation.degrader import Degrader 2 | from genalog.generation.content import CompositeContent, ContentType 3 | from genalog.generation.document import DocumentGenerator 4 | 5 | 6 | TEST_OUTPUT_DIR = "test_out/" 7 | SAMPLE_TXT = """Everton 's Duncan Ferguson , who scored twice against Manchester United on Wednesday , 8 | was picked on Thursday for the Scottish squad after a 20-month exile .""" 9 | DEFAULT_TEMPLATE = "text_block.html.jinja" 10 | 
import os

import cv2
import pytest

from genalog.generation.content import CompositeContent, ContentType
from genalog.generation.document import DocumentGenerator

TEMPLATE_PATH = "tests/e2e/templates"
TEST_OUT_FOLDER = "test_out/"
SAMPLE_TXT = "foo"
CONTENT = CompositeContent([SAMPLE_TXT], [ContentType.PARAGRAPH])


@pytest.fixture
def doc_generator():
    return DocumentGenerator(template_path=TEMPLATE_PATH)


def _render_and_check_bg(doc_generator, css_color, expected_bgra, out_filename):
    """Render solid-background docs and assert the top-left pixel color.

    Renders with channel="BGRA", checks pixel (0, 0) against ``expected_bgra``,
    then writes the rendered image out for manual inspection.
    """
    generator = doc_generator.create_generator(CONTENT, ["solid_bg.html.jinja"])
    for doc in generator:
        doc.update_style(background_color=css_color)
        img_array = doc.render_array(resolution=100, channel="BGRA")
        assert tuple(img_array[0][0]) == expected_bgra
        # cv2.imwrite returns False silently when the target folder is missing;
        # the original assumed TEST_OUT_FOLDER existed. Create it defensively.
        os.makedirs(TEST_OUT_FOLDER, exist_ok=True)
        assert cv2.imwrite(os.path.join(TEST_OUT_FOLDER, out_filename), img_array)


@pytest.mark.io
def test_red_channel(doc_generator):
    # css "red" is rgb(255,0,0) or bgra(0,0,255,255)
    _render_and_check_bg(doc_generator, "red", (0, 0, 255, 255), "red.png")


@pytest.mark.io
def test_green_channel(doc_generator):
    # css "green" is rgb(0,128,0) or bgra(0,128,0,255)
    _render_and_check_bg(doc_generator, "green", (0, 128, 0, 255), "green.png")


@pytest.mark.io
def test_blue_channel(doc_generator):
    # css "blue" is rgb(0,0,255) or bgra(255,0,0,255)
    _render_and_check_bg(doc_generator, "blue", (255, 0, 0, 255), "blue.png")
blob_client.list_blobs(dst_folder) 27 | uploaded_items = sorted(list(uploaded_items), key=lambda x: x.name) 28 | assert uploaded_items[0].name == f"{dst_folder}/0.png" 29 | assert uploaded_items[1].name == f"{dst_folder}/1.png" 30 | assert uploaded_items[2].name == f"{dst_folder}/11.png" 31 | blob_client.delete_blobs_folder(dst_folder) 32 | assert ( 33 | len(list(blob_client.list_blobs(dst_folder)[0])) == 0 34 | ), f"folder {dst_folder} was not deleted" 35 | 36 | dst_folder, _ = blob_client.upload_images_to_blob( 37 | subfolder, "test_images", use_async=use_async 38 | ) 39 | assert dst_folder == "test_images" 40 | uploaded_items, _ = blob_client.list_blobs(dst_folder) 41 | uploaded_items = sorted(list(uploaded_items), key=lambda x: x.name) 42 | assert uploaded_items[0].name == f"{dst_folder}/0.png" 43 | assert uploaded_items[1].name == f"{dst_folder}/1.png" 44 | assert uploaded_items[2].name == f"{dst_folder}/11.png" 45 | blob_client.delete_blobs_folder(dst_folder) 46 | assert ( 47 | len(list(blob_client.list_blobs(dst_folder)[0])) == 0 48 | ), f"folder {dst_folder} was not deleted" 49 | 50 | 51 | @pytest.mark.skip(reason=( 52 | "Flaky test. 
import glob
import os

import numpy as np
import pytest

from genalog.generation.document import DocumentGenerator
from genalog.pipeline import AnalogDocumentGeneration, generate_dataset_multiprocess

EXAMPLE_TEXT_FILE = "tests/unit/text/data/gt_1.txt"
INPUT_TEXT_FILENAMES = glob.glob("tests/unit/text/data/gt_*.txt")

STYLES = {"font_size": ["5px"]}
# Multiple values per style are not supported right now
STYLES_COMBINATION = {"font_size": ["5px", "6px"]}
DEGRATIONS = [
    ("blur", {"radius": 3}),
    ("morphology", {"operation": "close"})
]


@pytest.fixture
def default_doc_generator():
    """Document generator with library-default settings."""
    return AnalogDocumentGeneration()


@pytest.fixture
def custom_doc_generator():
    """Document generator with explicit styles, degradations and resolution."""
    return AnalogDocumentGeneration(
        styles=STYLES, degradations=DEGRATIONS, resolution=300
    )


@pytest.fixture
def empty_style_doc_generator():
    """Document generator configured with an empty style set."""
    return AnalogDocumentGeneration(styles={})


@pytest.mark.parametrize("doc_generator", [
    pytest.lazy_fixture('default_doc_generator'),
    pytest.lazy_fixture('custom_doc_generator')
])
def test_generate_img_array(doc_generator):
    """In-memory generation (target_folder=None) yields a numpy image array."""
    templates = doc_generator.list_templates()
    # Precondition: at least one template must be discoverable
    assert len(templates) > 0

    sample_img = doc_generator.generate_img(
        EXAMPLE_TEXT_FILE, templates[0], target_folder=None
    )
    assert sample_img is not None
    assert isinstance(sample_img, np.ndarray)


def test_generate_img_array_empty(empty_style_doc_generator):
    """With an empty style set, generation returns no image."""
    templates = empty_style_doc_generator.list_templates()
    # Precondition: at least one template must be discoverable
    assert len(templates) > 0

    sample_img = empty_style_doc_generator.generate_img(
        EXAMPLE_TEXT_FILE, templates[0], target_folder=None
    )
    assert sample_img is None


@pytest.mark.io
@pytest.mark.parametrize("doc_generator", [
    pytest.lazy_fixture('default_doc_generator'),
    pytest.lazy_fixture('custom_doc_generator')
])
def test_generate_img_write_to_disk(tmpdir, doc_generator):
    """generate_img() with a target folder writes PNGs under <target>/img."""
    os.makedirs(os.path.join(tmpdir, "img"))  # TODO: generate_img() store image under "img" folder
    output_img_wildcard = os.path.join(tmpdir, "img", "*.png")
    # Preconditions: output folder is empty and templates exist
    assert len(glob.glob(output_img_wildcard)) == 0
    templates = doc_generator.list_templates()
    assert len(templates) > 0

    doc_generator.generate_img(
        EXAMPLE_TEXT_FILE, templates[0], target_folder=tmpdir
    )
    # look for any png on file
    assert len(glob.glob(output_img_wildcard)) > 0


@pytest.mark.io
@pytest.mark.parametrize("styles", [
    STYLES,
    pytest.param(
        STYLES_COMBINATION,
        marks=pytest.mark.xfail(
            reason="Style combinations are not supported. Only one value per style", strict=True)
    )
])
@pytest.mark.parametrize("folder_name", ["result", "result/"])
def test_generate_dataset_multiprocess(tmpdir, folder_name, styles):
    """Multiprocess dataset generation writes one PNG per (input file, style)."""
    assert len(INPUT_TEXT_FILENAMES) > 0
    output_folder = os.path.join(tmpdir, folder_name)
    generate_dataset_multiprocess(
        INPUT_TEXT_FILENAMES, output_folder, styles, DEGRATIONS, "text_block.html.jinja"
    )
    generated_imgs = glob.glob(os.path.join(output_folder, "**", "*.png"))
    expected_count = len(INPUT_TEXT_FILENAMES) * len(
        DocumentGenerator.expand_style_combinations(styles)
    )
    assert len(generated_imgs) > 0
    assert len(generated_imgs) == expected_count
) 37 | _compare_content( 38 | "tests/e2e/data/splitter/example_splits/clean_labels/0.txt", 39 | f"{tmpdir}/clean_labels/0.txt", 40 | ) 41 | _compare_content( 42 | "tests/e2e/data/splitter/example_splits/clean_labels/1.txt", 43 | f"{tmpdir}/clean_labels/1.txt", 44 | ) 45 | -------------------------------------------------------------------------------- /tests/required_env.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from itertools import chain 3 | 4 | 5 | class RequiredSecrets(Enum): 6 | BLOB_KEY = 'BLOB_KEY' 7 | SEARCH_SERVICE_KEY = 'SEARCH_SERVICE_KEY' 8 | COGNITIVE_SERVICE_KEY = 'COGNITIVE_SERVICE_KEY' 9 | 10 | 11 | class RequiredConstants(Enum): 12 | COMPUTER_VISION_ENDPOINT = 'COMPUTER_VISION_ENDPOINT' 13 | SEARCH_SERVICE_NAME = 'SEARCH_SERVICE_NAME' 14 | SKILLSET_NAME = 'SKILLSET_NAME' 15 | INDEX_NAME = "INDEX_NAME" 16 | INDEXER_NAME = "INDEXER_NAME" 17 | DATASOURCE_NAME = "DATASOURCE_NAME" 18 | DATASOURCE_CONTAINER_NAME = "DATASOURCE_CONTAINER_NAME" 19 | BLOB_NAME = "BLOB_NAME" 20 | 21 | 22 | RequiredEnvVar = Enum("RequiredEnvVar", [ 23 | (i.name, i.value) for i in chain(RequiredSecrets, RequiredConstants) 24 | ]) 25 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/__init__.py -------------------------------------------------------------------------------- /tests/unit/cases/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/cases/__init__.py -------------------------------------------------------------------------------- /tests/unit/cases/label_propagation.py: 
-------------------------------------------------------------------------------- 1 | # Test cases for genalog.text.ner_label.propagate_label_to_ocr() method. 2 | # For READABILITY purpose, ground truth and noisy text are presented as 3 | # a whole string, not in their tokenized format. 4 | 5 | # Notice the `propagate_label_to_ocr()` method has the contract of 6 | # (list, list, list) -> (list, list, list) 7 | # consuming both ground truth text and noisy text as lists of tokens. 8 | # We will use `genalog.text.preprocess.tokenize()` to tokenize these strings 9 | from genalog.text import preprocess 10 | 11 | ner_labels = [] 12 | gt_txt = [] 13 | ns_txt = [] 14 | desired_ocr_labels = [] 15 | 16 | # Alignment is one-to-one 17 | ner_labels.append(["B-PLACE", "I-PLACE"]) 18 | gt_txt.append("New York") 19 | ns_txt.append("New York") 20 | desired_ocr_labels.append(["B-PLACE", "I-PLACE"]) 21 | 22 | # Alignment is one-to-many 23 | ner_labels.append(["B-PLACE", "I-PLACE"]) 24 | gt_txt.append("New York") 25 | ns_txt.append("N ew York") 26 | desired_ocr_labels.append(["B-PLACE", "I-PLACE", "I-PLACE"]) 27 | 28 | # Trailing B-Labels 29 | ner_labels.append(["B-PLACE", "I-PLACE", "O", "B-PLACE", "O", "B-PLACE"]) 30 | gt_txt.append("New York , Boston , Sidney") 31 | ns_txt.append("N ew York Boston Sidney") 32 | desired_ocr_labels.append(["B-PLACE", "I-PLACE", "I-PLACE", "B-PLACE", "B-PLACE"]) 33 | 34 | # Alignment is many-to-one 35 | ner_labels.append(["B-PLACE", "I-PLACE"]) 36 | gt_txt.append("New York") 37 | ns_txt.append("NewYork") 38 | desired_ocr_labels.append(["B-PLACE"]) 39 | 40 | # Alignment is many-to-many 41 | ner_labels.append(["B-PLACE", "I-PLACE", "O", "O"]) 42 | gt_txt.append("New York is big") 43 | ns_txt.append("N ewYorkis big") 44 | desired_ocr_labels.append(["B-PLACE", "I-PLACE", "O"]) 45 | 46 | # Missing tokens (I-label) 47 | ner_labels.append(["B-PLACE", "I-PLACE", "V", "O"]) 48 | gt_txt.append("New York is big") 49 | ns_txt.append("New is big") 50 | 
desired_ocr_labels.append(["B-PLACE", "V", "O"]) 51 | 52 | # Missing tokens (B-label) 53 | ner_labels.append(["B-PLACE", "I-PLACE", "V", "O"]) 54 | gt_txt.append("New York is big") 55 | ns_txt.append(" York is big") 56 | desired_ocr_labels.append(["B-PLACE", "V", "O"]) 57 | 58 | ner_labels.append(["O", "O", "B-PLACE"]) 59 | gt_txt.append("This is home") 60 | ns_txt.append("Th isis ho me") 61 | desired_ocr_labels.append(["O", "O", "B-PLACE", "I-PLACE"]) 62 | 63 | # Missing tokens + many-to-many 64 | ner_labels.append(["B-PLACE", "I-PLACE", "O", "O"]) 65 | gt_txt.append("New York is big") 66 | ns_txt.append("N ewYo rkis big") 67 | desired_ocr_labels.append(["B-PLACE", "I-PLACE", "I-PLACE", "O"]) 68 | 69 | # Missing tokens + many-to-many 70 | ner_labels.append(["B-PLACE", "O", "O"]) 71 | gt_txt.append("Boston is big ") 72 | ns_txt.append("B oston bi g") 73 | desired_ocr_labels.append(["B-PLACE", "I-PLACE", "O", "O"]) 74 | 75 | # Single char tokens 76 | ner_labels.append(["O", "O", "B-PLACE"]) 77 | gt_txt.append("a big city") 78 | ns_txt.append("abigcity") 79 | desired_ocr_labels.append(["O"]) 80 | 81 | # Split into single-char tokens 82 | ner_labels.append(["O", "O", "B-PLACE"]) 83 | gt_txt.append("a big city") 84 | ns_txt.append("abig c it y") 85 | desired_ocr_labels.append(["O", "B-PLACE", "I-PLACE", "I-PLACE"]) 86 | 87 | # Tokens with repeating characters 88 | ner_labels.append(["O", "FRUIT"]) 89 | gt_txt.append("an apple") 90 | ns_txt.append("aa aaple") 91 | desired_ocr_labels.append(["O", "FRUIT"]) 92 | 93 | # Tokens with regex special characters 94 | ner_labels.append(["O", "FRUIT", "O"]) 95 | gt_txt.append("an apple .*/") 96 | ns_txt.append("@n @ @p|e *. |") 97 | desired_ocr_labels.append(["O", "FRUIT", "FRUIT", "O", "O"]) 98 | 99 | # Tokens with regex special characters with B-labels 100 | ner_labels.append(["O", "B-FRUIT", "O"]) 101 | gt_txt.append("an apple .*/") 102 | ns_txt.append("@n @ @p|e *. 
|") 103 | desired_ocr_labels.append(["O", "B-FRUIT", "I-FRUIT", "O", "O"]) 104 | 105 | # Tokens with regex special characters in BOTH clean and noisy text 106 | ner_labels.append(["O", "O", "ENTERTAINMENT", "O"]) 107 | gt_txt.append("@ new TV !") 108 | ns_txt.append("@ n ow T\\/ |") 109 | desired_ocr_labels.append(["O", "O", "O", "ENTERTAINMENT", "O"]) 110 | 111 | # Tokenize ground truth and noisy text strings 112 | gt_tokens = [preprocess.tokenize(txt) for txt in gt_txt] 113 | ns_tokens = [preprocess.tokenize(txt) for txt in ns_txt] 114 | 115 | # test function expect params in tuple of 116 | # (gt_label, gt_tokens, ocr_tokens, desired_ocr_labels) 117 | LABEL_PROPAGATION_REGRESSION_TEST_CASES = list( 118 | zip(ner_labels, gt_tokens, ns_tokens, desired_ocr_labels) 119 | ) 120 | -------------------------------------------------------------------------------- /tests/unit/degradation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/degradation/__init__.py -------------------------------------------------------------------------------- /tests/unit/degradation/test_degrader.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from unittest.mock import patch 3 | 4 | import numpy as np 5 | import pytest 6 | 7 | from genalog.degradation.degrader import DEFAULT_METHOD_PARAM_TO_INCLUDE 8 | from genalog.degradation.degrader import Degrader, ImageState 9 | 10 | MOCK_IMAGE_SHAPE = (4, 3) 11 | MOCK_IMAGE = np.arange(12, dtype=np.uint8).reshape(MOCK_IMAGE_SHAPE) 12 | 13 | 14 | @pytest.fixture 15 | def empty_degrader(): 16 | effects = [] 17 | return Degrader(effects) 18 | 19 | 20 | @pytest.fixture( 21 | params=[ 22 | [("blur", {"radius": 5})], 23 | [("blur", {"src": ImageState.ORIGINAL_STATE, "radius": 5})], 24 | [("blur", {"src": ImageState.CURRENT_STATE, "radius": 5})], 25 | [ 26 | 
("morphology", {"src": ImageState.ORIGINAL_STATE, "operation": "open"}), 27 | ("morphology", {"operation": "close"}), 28 | ("morphology", {"src": ImageState.ORIGINAL_STATE, "operation": "dilate"}), 29 | ("morphology", {"operation": "erode"}), 30 | ], 31 | [ 32 | ("blur", {"radius": 5}), 33 | ( 34 | "bleed_through", 35 | { 36 | "src": ImageState.CURRENT_STATE, 37 | "alpha": 0.7, 38 | "background": ImageState.ORIGINAL_STATE, 39 | }, 40 | ), 41 | ( 42 | "morphology", 43 | {"operation": "open", "kernel_shape": (3, 3), "kernel_type": "ones"}, 44 | ), 45 | ], 46 | ] 47 | ) 48 | def degrader(request): 49 | effects = request.param 50 | return Degrader(effects) 51 | 52 | 53 | def test_empty_degrader_init(empty_degrader): 54 | assert empty_degrader.effects_to_apply == [] 55 | 56 | 57 | def test_degrader_init(degrader): 58 | assert degrader.effects_to_apply != [] 59 | for effect_tuple in degrader.effects_to_apply: 60 | method_name, method_kwargs = effect_tuple 61 | assert DEFAULT_METHOD_PARAM_TO_INCLUDE in method_kwargs 62 | param_value = method_kwargs[DEFAULT_METHOD_PARAM_TO_INCLUDE] 63 | assert ( 64 | param_value is ImageState.ORIGINAL_STATE 65 | or param_value is ImageState.CURRENT_STATE 66 | ) 67 | 68 | 69 | @pytest.mark.parametrize( 70 | "effects, error_thrown", 71 | [ 72 | ([], None), # Empty effect 73 | (None, TypeError), 74 | ([("blur", {"radius": 5})], None), # Validate input 75 | ([("not_a_func", {"radius": 5})], ValueError), # Invalid method name 76 | ([("blur", {"not_a_argument": 5})], ValueError), # Invalid kwargs 77 | ([("blur")], ValueError), # Missing kwargs 78 | ( 79 | [ 80 | ("blur", {"radius": 5}), 81 | ("bleed_through", {"alpha": "0.8"}), 82 | ("morphology", {"operation": "open"}), 83 | ], 84 | None, 85 | ), # Multiple effects 86 | ( 87 | [ 88 | ("blur", {"radius": 5}), 89 | ("bleed_through", {"not_argument": "0.8"}), 90 | ("morphology", {"missing value"}), 91 | ], 92 | ValueError, 93 | ), # Multiple effects 94 | ], 95 | ) 96 | def 
test_degrader_validate_effects(effects, error_thrown): 97 | if error_thrown: 98 | with pytest.raises(error_thrown): 99 | Degrader.validate_effects(effects) 100 | else: 101 | Degrader.validate_effects(effects) 102 | 103 | 104 | def test_degrader_apply_effects(degrader): 105 | method_names = [effect[0] for effect in degrader.effects_to_apply] 106 | with patch("genalog.degradation.effect") as mock_effect: 107 | degrader.apply_effects(MOCK_IMAGE) 108 | for method in method_names: 109 | assert mock_effect[method].is_called() 110 | # assert degraded.shape == MOCK_IMAGE_SHAPE 111 | 112 | 113 | def test_degrader_apply_effects_e2e(degrader): 114 | degraded = degrader.apply_effects(MOCK_IMAGE) 115 | assert degraded.shape == MOCK_IMAGE_SHAPE 116 | assert degraded.dtype == np.uint8 117 | 118 | 119 | def test_degrader_instructions(degrader): 120 | original_instruction = copy.deepcopy(degrader.effects_to_apply) 121 | degrader.apply_effects(MOCK_IMAGE) 122 | degrader.apply_effects(MOCK_IMAGE) 123 | # Make sure the degradation instructions are not altered 124 | assert len(original_instruction) == len(degrader.effects_to_apply) 125 | for i in range(len(original_instruction)): 126 | org_method_name, org_method_arg = original_instruction[i] 127 | method_name, method_arg = degrader.effects_to_apply[i] 128 | assert org_method_name == method_name 129 | assert len(org_method_arg) == len(method_arg) 130 | for key in org_method_arg.keys(): 131 | assert isinstance(org_method_arg[key], type(method_arg[key])) 132 | assert org_method_arg[key] == method_arg[key] 133 | -------------------------------------------------------------------------------- /tests/unit/generation/2x2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/generation/2x2.jpg -------------------------------------------------------------------------------- /tests/unit/generation/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/generation/__init__.py -------------------------------------------------------------------------------- /tests/unit/generation/templates/font_family.html.jinja: -------------------------------------------------------------------------------- 1 | {{font_family}} -------------------------------------------------------------------------------- /tests/unit/generation/templates/mock.html.jinja: -------------------------------------------------------------------------------- 1 | {{ content }} -------------------------------------------------------------------------------- /tests/unit/generation/templates/multipage.html.jinja: -------------------------------------------------------------------------------- 1 | {% if content and content.content_type.__str__() == "ContentType.COMPOSITE" %} 2 | {% for c in content %} 3 | {% if c.content_type.__str__() == "ContentType.PARAGRAPH"%} 4 |

5 | {{ c }} 6 |

7 | {% else %} 8 |

Unsupported Content Type: {{c.content_type.__str__()}}

9 | {% endif %} 10 | {% endfor %} 11 | {% else %} 12 | No content loaded 13 | {% endif %} 14 | -------------------------------------------------------------------------------- /tests/unit/generation/test_content.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from genalog.generation.content import CompositeContent, Content, ContentType 4 | from genalog.generation.content import Paragraph, Title 5 | 6 | CONTENT_LIST = ["foo", "bar"] 7 | COMPOSITE_CONTENT_TYPE = [ContentType.TITLE, ContentType.PARAGRAPH] 8 | TEXT = "foo bar" 9 | 10 | 11 | @pytest.fixture 12 | def content_base_class(): 13 | return Content() 14 | 15 | 16 | @pytest.fixture 17 | def paragraph(): 18 | return Paragraph(TEXT) 19 | 20 | 21 | @pytest.fixture 22 | def title(): 23 | return Title(TEXT) 24 | 25 | 26 | @pytest.fixture 27 | def section(): 28 | return CompositeContent(CONTENT_LIST, COMPOSITE_CONTENT_TYPE) 29 | 30 | 31 | def test_content_set_content_type(content_base_class): 32 | with pytest.raises(TypeError): 33 | content_base_class.set_content_type("NOT VALID CONTENT TYPE") 34 | content_base_class.set_content_type(ContentType.PARAGRAPH) 35 | 36 | 37 | def test_paragraph_init(paragraph): 38 | with pytest.raises(TypeError): 39 | Paragraph([]) 40 | assert paragraph.content_type == ContentType.PARAGRAPH 41 | 42 | 43 | def test_paragraph_print(paragraph): 44 | assert paragraph.__str__() 45 | 46 | 47 | def test_paragraph_iterable_indexable(paragraph): 48 | for index, character in enumerate(paragraph): 49 | assert character == paragraph[index] 50 | 51 | 52 | def test_title_init(title): 53 | with pytest.raises(TypeError): 54 | Title([]) 55 | assert title.content_type == ContentType.TITLE 56 | 57 | 58 | def test_title_iterable_indexable(title): 59 | for index, character in enumerate(title): 60 | assert character == title[index] 61 | 62 | 63 | def test_composite_content_init(section): 64 | with pytest.raises(TypeError): 65 | CompositeContent((), 
[]) 66 | assert section.content_type == ContentType.COMPOSITE 67 | 68 | 69 | def test_composite_content_iterable(section): 70 | for index, content in enumerate(section): 71 | assert content.content_type == COMPOSITE_CONTENT_TYPE[index] 72 | 73 | 74 | def test_composite_content_print(section): 75 | assert "foo" in section.__str__() 76 | assert "bar" in section.__str__() 77 | -------------------------------------------------------------------------------- /tests/unit/ocr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/ocr/__init__.py -------------------------------------------------------------------------------- /tests/unit/ocr/data/img/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/ocr/data/img/0.png -------------------------------------------------------------------------------- /tests/unit/ocr/data/img/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/ocr/data/img/1.png -------------------------------------------------------------------------------- /tests/unit/ocr/data/img/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/ocr/data/img/11.png -------------------------------------------------------------------------------- /tests/unit/ocr/data/metrics.csv: -------------------------------------------------------------------------------- 1 | 
edit_insert,edit_delete,edit_replace,edit_insert_spacing,edit_delete_spacing,insert,delete,replace,spacing,total_chars,total_words,total_alnum_words,matching_chars,matching_alnum_words,matching_words,alnum_word_accuracy,word_accuracy,char_accuracy,txt_path,ocr_json_path,filename 2 | 1,0,0,1,13,1,0,0,14,1027,166,159,1025,144,150,0.9056603773584906,0.9036144578313253,0.9980525803310614,tests/unit/ocr/data/text/0.txt,tests/unit/ocr/data/json/521c38122f783673598856cd81d91c21_0.png.json,0.txt 3 | 3,0,0,0,5,3,0,0,5,958,182,176,955,165,171,0.9375,0.9395604395604396,0.9968684759916493,tests/unit/ocr/data/text/1.txt,tests/unit/ocr/data/json/521c38122f783673598856cd81d91c21_1.png.json,1.txt 4 | 2,0,0,0,9,2,0,0,9,1022,188,183,1020,170,175,0.9289617486338798,0.9308510638297872,0.9980430528375733,tests/unit/ocr/data/text/11.txt,tests/unit/ocr/data/json/521c38122f783673598856cd81d91c21_11.png.json,11.txt 5 | -------------------------------------------------------------------------------- /tests/unit/ocr/data/metrics/json/123_001.json: -------------------------------------------------------------------------------- 1 | [{"text": "BIRDS\n\nOF\n\nGREAT BRITAIN AND IRELAND\n\nORDER PASSERES\n\nFAMILY ORIOLID^.\n\nTHIS famil}^ consists of a tropical group of brightly coloured birds in whicli\nyellow and black, or scarlet and black, are the prevailing hues. 
Although\nin the general form of their heads they somewhat remind one of Starlings,\nthey must not be confounded with the so-called \"Orioles\" of the New World,\nwhich belong to the family IdcridcE or Hang-nests and Troupials, a group of birds\nlinking the Finches and the Starlings, and feeding largely upon seeds and insects.\n\nThe late Henry Seebohm was of opinion that the Orioles were nearly related\nto the Crows ; he, therefore, placed the genus Oriolus in his Subfamily Corvince, from\nwhich he said that they chiefly differed in their exposed nostrils, although he\nadmitted that the tarsus might perhaps be slightly shorter, and the prevailing\ncolours different ; whilst the sexes also were dissimilar.*\n\nIn addition to the above distinctive characters, the third primary of the wing\n(not the fourth or fifth) appears to be the longest, in the Orioles; whilst the\n"}] -------------------------------------------------------------------------------- /tests/unit/ocr/data/metrics/json/123_002.json: -------------------------------------------------------------------------------- 1 | [{"text": "whole character of the nest, which Seebohm often made much of iu his classifi-\ncation, is quite unlike that of a Crow ; being neatly woven, and slung like a\nhammock between the forks of a branch : moreover, whereas the eggs of the Crows\nare usually of some shade of green or blue, heavily spotted and speckled, or\nblotched and mottled, with various shades of olive or brown, those of the Orioles\nvary from white to salmon-pink, clearly spotted with blackish-brown, and some-\ntimes with lilacine-greyisli shell-markings.\n\nThe call-notes and songs of the Orioles are bright and melodious ; but this\nfact would not be a sufficient reason for dissociating them from the Crows ;\nalthough our native species of Corvidcr do not shine as whistlers, in their wild\nstate. 
I think, however, that Howard Saunders was fully justified in adopting\nthe present family for the Orioles.\n\nFamilx- ORIOL ID^E.\n\nThe Golden Oriole.\n\nOriolus ga/bula, LiNN.\n\nBREEDS in suitable localities throughout Europe south of the Baltic and in\nAlgeria ; passes through Greece, Asia Minor, Palestine, Egypt, and Nubia,\non migration ; and winters in North Africa, south-eastwards to Madagascar,\nNatal, and westwards to Damara Land : stragglers sometimes occur in Madeira,\nand the Azores.\n\nThe Golden Oriole is a regular visitor to our shores in spring, the largest\nnumber having been seen in the Scilly Islands, and Cornwall ; it has, however,\nbeen met with in not a few of the southern and south-eastern counties, and several\ninstances of its breeding with us have been recorded. In 1868, I saw a male\nspecimen of this bird near Linton, in Devonshire, and in July, 1887, I was just\ntoo late to see the species in Essex ; Mr. Fitch, of Maldon (whom I was visiting)\ninformed me that the bird had been seen in one of his thickets during the previous\n"}] -------------------------------------------------------------------------------- /tests/unit/ocr/data/metrics/json/123_003.json: -------------------------------------------------------------------------------- 1 | [{"text": "week. We visited the place iu the hope of discovering a uest, but were unsuc-\ncessful.\n\nIn Ireland it has chiefly occurred on the east coast, most of the examples\nbeing females, or immature males ; a specimen was recorded as shot in the Faroe\nIslands, in Maj^ 1893, by Col. H. W. Feildeu. In June, 1906, one was killed\nby a cat on the Marine Parade at Brighton. 
Perhaps the nearest point to\nLondon at which it has been recognized, was noted in the \"Zoologist\" for 1892,\nan example having apparently been seen in Richmond Park.\n\nThe male of this species is bright gamboge-yellow, the lores, wings (excepting\nthe terminal third of the primary-coverts) and a great part of the tail black ; the\nprimaries, excepting the two outermost, are edged externally, and the secondaries\nare tipped with j^ellowish-white ; the two central tail-feathers are yellowish at the\nbase, and yellow at the tip, and the other feathers have the terminal third of the\nouter webs, and borders of the inner webs yellow ; bill reddish-ochreous ; feet\nleaden-grey ; iris bright red. The female is much duller than the male, greener,\nand with the black colouring replaced by deep brown ; the throat, breast, and\ncentre of belly whitish ; the throat, breast, and flanks streaked with greyish.\nYoung birds are greener and browner than the female, but otherwise similar ;\nnestlings have the upper parts olivaceous, spotted with yellow.\n\nThe Golden Oriole frequents gardens, groves, plantations, thickets, and the\noutskirts of large woods, especially in the neighbourhood of water ; it seems to\nprefer the haunts of man, yet is so shy that it rarely remains in view for more\nthan a minute as it flies rapidly, in somewhat Thrush-like, though more undulating\nfashion, from cover to cover ; choosing ever the densest foliage, as if aware of the\nperilous brilliance of its plumage : possibly it may slowly be acquiring a hereditary\nknowledge of the fact that, if but a glimpse is obtained of it, an attempt at least\nis made to put an end to its life ; or if it fails to comprehend so much, it may\ninherit a dread of the thunder and lightning which, for generations, have heralded\nits appearance : birds are not naturally fearful of man ; for even those which have\nbeen taught by their parents to dread him, can be generally converted by gentleness\nand petting : moreover 
the fact that a grown man can tame a small bird, whereas\neven the tamest will always show the greatest fear of a little boy, certainly seems\nto prove that the instinctive dread of the monkey-nature in the latter is deeply\nimplanted in all birds ; j ust as is that of a cat, even though that animal may\nnever have been seen by the bird previously.*\n"}] -------------------------------------------------------------------------------- /tests/unit/ocr/data/metrics/metrics.csv: -------------------------------------------------------------------------------- 1 | edit_insert,edit_delete,edit_replace,edit_insert_spacing,edit_delete_spacing,insert,delete,replace,spacing,total_chars,total_words,total_alnum_words,matching_chars,matching_alnum_words,matching_words,alnum_word_accuracy,word_accuracy,char_accuracy,txt_path,ocr_json_path,filename 2 | 2,5,5,0,2,1,1,5,2,1068,176,176,1061,169,169,0.9602272727272727,0.9602272727272727,0.9934456928838952,tests/unit/ocr/data/metrics/text/001.txt,tests/unit/ocr/data/metrics/json/123_001.png.json,001.txt 3 | 0,5,17,0,11,0,2,8,11,1789,301,301,1772,283,283,0.9401993355481728,0.9401993355481728,0.9904974846282839,tests/unit/ocr/data/metrics/text/002.txt,tests/unit/ocr/data/metrics/json/123_002.png.json,002.txt 4 | 0,1,6,0,17,0,0,5,17,2659,460,459,2653,436,437,0.9498910675381264,0.95,0.9977435125987213,tests/unit/ocr/data/metrics/text/003.txt,tests/unit/ocr/data/metrics/json/123_003.png.json,003.txt 5 | -------------------------------------------------------------------------------- /tests/unit/ocr/data/metrics/substitution.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/ocr/data/metrics/substitution.pkl -------------------------------------------------------------------------------- /tests/unit/ocr/data/metrics/text/001.txt: 
-------------------------------------------------------------------------------- 1 | BIRDS 2 | 3 | OF 4 | 5 | GREAT BRITAIN AND IRELAND 6 | 7 | ORDER PASSERES 8 | 9 | FAMILY ORIOLIDA. 10 | 11 | THIS family consists of a tropical group of brightly coloured birds in which 12 | yellow and black, or scarlet and black, are the prevailing hues. Although 13 | in the general form of their heads they somewhat remind one of Starlings, 14 | they must not be confounded with the so-called "Orioles" of the New World, 15 | which belong to the family Icterida or Hang-nests and Troupials, a group of birds 16 | linking the Finches and the Starlings, and feeding largely upon seeds and insects. 17 | 18 | The late Henry Seebohm was of opinion that the Orioles were nearly related 19 | to the Crows; he, therefore, placed the genus Oriolus in his Subfamily Corvina, from 20 | which he said that they chiefly differed in their exposed nostrils, although he 21 | admitted that the tarsus might perhaps be slightly shorter, and the prevailing 22 | colours different; whilst the sexes also were dissimilar.* 23 | 24 | In addition to the above distinctive characters, the third primary of the wing 25 | (not the fourth or fifth) appears to be the longest, in the Orioles; whilst the 26 | -------------------------------------------------------------------------------- /tests/unit/ocr/data/metrics/text/002.txt: -------------------------------------------------------------------------------- 1 | whole character of the nest, which Seebohm often made much of in his classifi- 2 | cation, is quite unlike that of a Crow; being neatly woven, and slung like a 3 | hammock between the forks of a branch: moreover, whereas the eggs of the Crows 4 | are usually of some shade of green or blue, heavily spotted and speckled, or 5 | blotched and mottled, with various shades of olive or brown, those of the Orioles 6 | vary from white to salmon-pink, clearly spotted with blackish-brown, and some- 7 | times with 
lilacine-greyish shell-markings. 8 | 9 | The call-notes and songs of the Orioles are bright and melodious; but this 10 | fact would not be a sufficient reason for dissociating them from the Crows; 11 | although our native species of Corvid do not shine as whistlers, in their wild 12 | state. I think, however, that Howard Saunders was fully justified in adopting 13 | the present family for the Orioles. 14 | 15 | Family-ORIOLID. 16 | 17 | THE GOLDEN ORIOLE. 18 | 19 | Oriolus galbula, LINN. 20 | 21 | BREEDS in suitable localities throughout Europe south of the Baltic and in 22 | Algeria; passes through Greece, Asia Minor, Palestine, Egypt, and Nubia, 23 | on migration; and winters in North Africa, south-eastwards to Madagascar, 24 | Natal, and westwards to Damara Land: stragglers sometimes occur in Madeira, 25 | and the Azores. 26 | 27 | The Golden Oriole is a regular visitor to our shores in spring, the largest 28 | number having been seen in the Scilly Islands, and Cornwall; it has, however, 29 | been met with in not a few of the southern and south-eastern counties, and several 30 | instances of its breeding with us have been recorded. In 1868, I saw a male 31 | specimen of this bird near Linton, in Devonshire, and in July, 1887, I was just 32 | too late to see the species in Essex; Mr. Fitch, of Maldon (whom I was visiting) 33 | informed me that the bird had been seen in one of his thickets during the previous 34 | -------------------------------------------------------------------------------- /tests/unit/ocr/data/metrics/text/003.txt: -------------------------------------------------------------------------------- 1 | week. We visited the place in the hope of discovering a nest, but were unsuc- 2 | cessful. 3 | 4 | In Ireland it has chiefly occurred on the east coast, most of the examples 5 | being females, or immature males; a specimen was recorded as shot in the Faroe 6 | Islands, in May, 1893, by Col. H. W. Feilden. 
In June, 1906, one was killed 7 | by a cat on the Marine Parade at Brighton. Perhaps the nearest point to 8 | London at which it has been recognized, was noted in the "Zoologist" for 1892, 9 | an example having apparently been seen in Richmond Park. 10 | 11 | The male of this species is bright gamboge-yellow, the lores, wings (excepting 12 | the terminal third of the primary-coverts) and a great part of the tail black; the 13 | primaries, excepting the two outermost, are edged externally, and the secondaries 14 | are tipped with yellowish-white; the two central tail-feathers are yellowish at the 15 | base, and yellow at the tip, and the other feathers have the terminal third of the 16 | outer webs, and borders of the inner webs yellow; bill reddish-ochreous; feet 17 | leaden-grey; iris bright red. The female is much duller than the male, greener, 18 | and with the black colouring replaced by deep brown; the throat, breast, and 19 | centre of belly whitish; the throat, breast, and flanks streaked with greyish. 20 | Young birds are greener and browner than the female, but otherwise similar; 21 | nestlings have the upper parts olivaceous, spotted with yellow. 
22 | 23 | The Golden Oriole frequents gardens, groves, plantations, thickets, and the 24 | outskirts of large woods, especially in the neighbourhood of water; it seems to 25 | prefer the haunts of man, yet is so shy that it rarely remains in view for more 26 | than a minute as it flies rapidly, in somewhat Thrush-like, though more undulating 27 | fashion, from cover to cover; choosing ever the densest foliage, as if aware of the 28 | perilous brilliance of its plumage: possibly it may slowly be acquiring a hereditary 29 | knowledge of the fact that, if but a glimpse is obtained of it, an attempt at least 30 | is made to put an end to its life; or if it fails to comprehend so much, it may 31 | inherit a dread of the thunder and lightning which, for generations, have heralded 32 | its appearance: birds are not naturally fearful of man ; for even those which have 33 | been taught by their parents to dread him, can be generally converted by gentleness 34 | and petting: moreover the fact that a grown man can tame a small bird, whereas 35 | even the tamest will always show the greatest fear of a little boy, certainly seems 36 | to prove that the instinctive dread of the monkey-nature in the latter is deeply 37 | implanted in all birds; just as is that of a cat, even though that animal may 38 | never have been seen by the bird previously.* 39 | -------------------------------------------------------------------------------- /tests/unit/ocr/data/substitution.json: -------------------------------------------------------------------------------- 1 | {"tests/unit/ocr/data/text\\0.txt": {}, "tests/unit/ocr/data/text\\1.txt": {}, "tests/unit/ocr/data/text\\11.txt": {}} -------------------------------------------------------------------------------- /tests/unit/ocr/data/substitution.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/ocr/data/substitution.pkl -------------------------------------------------------------------------------- /tests/unit/ocr/data/text/0.txt: -------------------------------------------------------------------------------- 1 | basically ,it was unanimously agreed by the various relevant parties .To its determination ,the Chinese regulatory department compares this reform to a die that has been cast . takes time to prove whether the stock can really meet expectations ,and any deviations that arise during the reform can be promptly corrected . viewers ,the China News program will here .This is Xu Li .Thank you for watching .Coming up is the Focus program hosted by Wang Shilin . ,dear viewers .Hello ,dear viewers . to Focus Today .Today ,let 's turn attention to a road cave -in accident happened in Beijing over the holiday Before dawn on January 3 ,a sewage leakage accident occurred at the main side roads of Jingguang Bridge ,East Ring Road ,Beijing Municipality , in the road caving in .Relevant from Beijing Municipality promptly emergency contingency plans .The administration department carried out supervision near the accident scene . ,how did the emergency response activated by governmental departments effectively during the holiday ? -------------------------------------------------------------------------------- /tests/unit/ocr/data/text/1.txt: -------------------------------------------------------------------------------- 1 | After the holiday ,what will be done handle citizens ' peak commute ? In ,what measures did relevant take to resolve issues such as waste ,heating ,and communication ,in order ensure that the lives of citizens not affected ? 
Well ,we have invited honorable guests to the studio today follow this topic with us .One of the honorable guests in the studio is Zhou Hanhua from the Institute of Law the Chinese Academy of Social .Hello .Next is Yang Yang ,a host of Traffic Radio Station .Hello .Welcome of you to the studio to participate our program .Well ,I especially want know ,ha ,how the two of you found the news on the day of the accident ? ,,about 11:00 m. yesterday ,ah ,I to find out through an SMS when I was .Uh-huh .Uh-huh .It happened that I going to have lunch with a friend ,um at noon .And then ,the friend first me an SMS ,Uh-huh .saying he would pick me up to go together .After that I received an SMS from 1860 .Uh-huh , was through an SMS . -------------------------------------------------------------------------------- /tests/unit/ocr/data/text/11.txt: -------------------------------------------------------------------------------- 1 | Furthermore ,Chaoyang Road is an .Uh-huh .Whether it is Chaoyang Road the east -west direction or the main side roads of East Third Ring Road in south -north direction ,as we can see this diagram ,it can be said that the at the main and side roads of East Ring Road normally has quite heavy ,especially during commuting times . ,Chaoyang Road is a very important in the east -west direction .Yes . people living in the west want to over from the city ,they have to go this road .Hence ,if a traffic occurs at this place ,we can indeed ,ha ,how widespread ,ah ,the extent the impact will be ,such as the of cars caught in traffic jams .Yes , I think everyone can see that from buses that cross Jingguang Bridge . .As buses that cross the Third Ring are currently ,right now affected by Jingguang Bridge accident ,ah ,the results this morning show that 32 bus throughout the neighborhood have had be rerouted .Uh-huh .Well ,I think perhaps many friends in other places wondering how one place is able to 32 commuter routes . 
"""Unit tests for the GROK OCR REST client.

Every outbound HTTP verb used by ``GrokRestClient`` (``put``/``post``/
``get``/``delete``) is monkeypatched to return :class:`MockedResponse`,
which fakes the subset of the Azure Cognitive Search REST API surface the
client talks to. No network access occurs.
"""
import json
import os

import pytest
import requests

from genalog.ocr.rest_client import GrokRestClient


def _load_json(path):
    """Load a JSON fixture, closing the file handle deterministically.

    The original tests used ``json.load(open(path, "r"))`` which leaks the
    handle until garbage collection (and warns under ``-W error``).
    """
    with open(path, "r") as f:
        return json.load(f)


@pytest.fixture(scope="module", autouse=True)
def set_azure_dummy_secrets(load_azure_resources):
    # The mocked endpoints never authenticate, so empty secrets suffice;
    # they only need to exist for GrokRestClient.create_from_env_var().
    os.environ['BLOB_KEY'] = ""
    os.environ['SEARCH_SERVICE_KEY'] = ""
    os.environ['COGNITIVE_SERVICE_KEY'] = ""


@pytest.fixture(autouse=True)
def setup_monkeypatch(monkeypatch):
    def mock_http(*args, **kwargs):
        return MockedResponse(args, kwargs)

    # Route every HTTP verb the client uses through the mock.
    monkeypatch.setattr(requests, "put", mock_http)
    monkeypatch.setattr(requests, "post", mock_http)
    monkeypatch.setattr(requests, "get", mock_http)
    monkeypatch.setattr(requests, "delete", mock_http)


class MockedResponse:
    """Minimal stand-in for ``requests.Response``.

    Dispatches on substrings of the requested URL to emulate the Azure
    Search skillset / indexer / index / datasource endpoints.
    """

    def __init__(self, args, kwargs):
        self.url = args[0]
        self.text = "response"
        self.headers = kwargs["headers"]

    def json(self):
        if "search.windows.net/skillsets/" in self.url:
            return {}

        if "search.windows.net/indexers/" in self.url:
            if "status" in self.url:
                return {"lastResult": {"status": "success"}, "status": "finished"}
            return {}

        if "search.windows.net/indexes/" in self.url:
            if "docs/search" in self.url:
                # Fake search hits: one document per OCR'd page fixture.
                prefix = "tests/unit/ocr/data/json/521c38122f783673598856cd81d91c21"
                return {
                    "value": [
                        {
                            "metadata_storage_name": (
                                f"521c38122f783673598856cd81d91c21_{page}.png"
                            ),
                            "layoutText": _load_json(f"{prefix}_{page}.png.json"),
                        }
                        for page in (0, 1, 11)
                    ]
                }
            # BUGFIX: was ``return json.dumps({})`` — a JSON *string*, while
            # every other branch returns a dict. ``Response.json()`` must
            # return parsed JSON, so return the empty dict itself.
            return {}
        if "search.windows.net/datasources/" in self.url:
            return {}

        raise ValueError(f"{self.url} not valid")

    def raise_for_status(self):
        # Mocked responses always represent HTTP success.
        pass


class TestGROK:
    def test_creating_indexing_pipeline(self):
        """Pipeline creation and teardown round-trip against the mock."""
        grok_rest_client = GrokRestClient.create_from_env_var()
        grok_rest_client.create_indexing_pipeline()
        grok_rest_client.delete_indexer_pipeline()

    def test_running_indexer(self):
        """Run the indexer and poll until the mock reports success."""
        grok_rest_client = GrokRestClient.create_from_env_var()
        grok_rest_client.create_indexing_pipeline()

        indexer_status = grok_rest_client.get_indexer_status()
        if indexer_status["status"] == "error":
            raise RuntimeError(f"indexer error: {indexer_status}")

        # if not already running start the indexer
        if indexer_status["lastResult"]["status"] != "inProgress":
            grok_rest_client.run_indexer()

        grok_rest_client.run_indexer()
        indexer_status = grok_rest_client.poll_indexer_till_complete()
        assert indexer_status["lastResult"]["status"] == "success"
        grok_rest_client.delete_indexer_pipeline()
No I do n't think so /. Orlando Florida hello /. Mike I wanted to know /. Go ahead /. the first time I was in New York I saw a nice looking young man on TV in a show Mike and Buffy /. was that you /? That was me he he /. That was me and Buff Cobb who was my /- That 's not Buff talking , is it /? No no ha ha /. cause Buff is up in New Hampshire /. She lives in a home up there /. She 's not well /. Um yeah she and I used to do a show on CBS when I first came to New York /. and it was a fascinating /. it was a little bit like Regis and uh Kathy or uh Regis and Kelly /. But you were married , right /? Yes /. What was it like to do a show with the wife /? Not easy /. Ha ha /. I 'm serious /. You know uh I 'd love to see that /. uh We used to bicker on the air /. and what happened was after a while the bickering continued after we got off the air /. After you got off the air /. You know what I mean /. I know /. Detroit hello /. Hi /. Hi /. How are you /? Fine /. Mr. Wallace this is a big pleasure for me to talk to you /. But um uh what is your most difficult interview that you had in Sixty Minutes the most difficult person that you could have ever interviewed /? I think probably the Aiatola really because he was not anxious to do it /. It was um just after the US hostages had been taken in Iran /. and I was surprised that he was willing to talk to us /. and it was a very very difficult business /. We did it in the holy city of which uh we /- and the circumstances were difficult /. They took good care to see that we did n't get into trouble /. Ha ha /. We 'll take a break /. And he just /- We 'll be back with more of Mike Wallace /. The book is Between You and Me /. the DVD is included /. oh what can one say it 's a terrific work /. We 'll be right back /. That voice was the subject of The Insider /. That man /. that man remains my hero /. Jeff Wygan who took on the tobacco cartel if you will /. 
And you remember when all those guys who ran the companies raised their hands and said Oh it 's not addictive /. they knew it was addictive /. And he has succeeded /. I mean really he has succeeded /. He runs a foundation for Smoke Free Kids /. and he 's gotten all kinds of success in all kinds of ways in foreign countries and so forth /. The man is my hero /. And you are mine /. And we have a minute and a half left /. I know you 're asked this all the time /. but how long you going to keep on keeping on /. How long you /- you know what the dickens would I do /? what would I do /? How long are you going to keep doing what you 're /- Yeah but you 're /- How old are you Mike /? Eighty - seven /. can you imagine /? I 'm going to be seventy - two /. so you 're fifteen years older than me /. That 's why I feel like a kid compared to you /. /. -------------------------------------------------------------------------------- /tests/unit/text/data/label_generator/labels/0.tsv: -------------------------------------------------------------------------------- 1 | basically O 2 | , O 3 | it O 4 | was O 5 | unanimously O 6 | agreed O 7 | upon O 8 | by O 9 | the O 10 | various O 11 | relevant O 12 | parties O 13 | . O 14 | 15 | To O 16 | express O 17 | its O 18 | determination O 19 | , O 20 | the O 21 | Chinese O 22 | securities O 23 | regulatory O 24 | department O 25 | compares O 26 | this O 27 | stock O 28 | reform O 29 | to O 30 | a O 31 | die O 32 | that O 33 | has O 34 | been O 35 | cast O 36 | . O 37 | 38 | It O 39 | takes O 40 | time O 41 | to O 42 | prove O 43 | whether O 44 | the O 45 | stock O 46 | reform O 47 | can O 48 | really O 49 | meet O 50 | expectations O 51 | , O 52 | and O 53 | whether O 54 | any O 55 | deviations O 56 | that O 57 | arise O 58 | during O 59 | the O 60 | stock O 61 | reform O 62 | can O 63 | be O 64 | promptly O 65 | corrected O 66 | . 
O 67 | 68 | Dear O 69 | viewers B-PERSONTYPE 70 | , O 71 | the O 72 | China B-ORGANIZATION 73 | News I-ORGANIZATION 74 | program O 75 | will O 76 | end O 77 | here O 78 | . O 79 | 80 | This O 81 | is O 82 | Xu B-PERSONNAME 83 | Li I-PERSONNAME 84 | . O 85 | 86 | Thank O 87 | you O 88 | everyone O 89 | for O 90 | watching O 91 | . O 92 | 93 | Coming O 94 | up O 95 | is O 96 | the O 97 | Focus B-ORGANIZATION 98 | Today I-ORGANIZATION 99 | program O 100 | hosted O 101 | by O 102 | Wang B-PERSONNAME 103 | Shilin I-PERSONNAME 104 | . O 105 | 106 | Good-bye O 107 | , O 108 | dear O 109 | viewers B-PERSONTYPE 110 | . O 111 | 112 | Hello O 113 | , O 114 | dear O 115 | viewers B-PERSONTYPE 116 | . O 117 | 118 | Welcome O 119 | to O 120 | Focus B-ORGANIZATION 121 | Today I-ORGANIZATION 122 | . O 123 | 124 | Today B-DATE 125 | , O 126 | let O 127 | 's O 128 | turn O 129 | our O 130 | attention O 131 | to O 132 | a O 133 | road O 134 | cave O 135 | - O 136 | in O 137 | accident O 138 | that O 139 | happened O 140 | in O 141 | Beijing B-GPE 142 | over O 143 | the O 144 | holiday O 145 | . O 146 | 147 | Before B-DATETIMERANGE 148 | dawn I-DATETIMERANGE 149 | on O 150 | January B-DATE 151 | 3 I-DATE 152 | , O 153 | a O 154 | sewage O 155 | pipe O 156 | leakage O 157 | accident O 158 | occurred O 159 | at O 160 | the O 161 | main O 162 | and O 163 | side O 164 | roads O 165 | of O 166 | Jingguang B-LOCATION 167 | Bridge I-LOCATION 168 | , O 169 | East B-ADDRESS 170 | Third I-ADDRESS 171 | Ring I-ADDRESS 172 | Road I-ADDRESS 173 | , O 174 | Beijing B-GPE 175 | Municipality I-GPE 176 | , O 177 | resulting O 178 | in O 179 | the O 180 | road O 181 | caving O 182 | in O 183 | . O 184 | 185 | Relevant O 186 | departments O 187 | from O 188 | Beijing B-GPE 189 | Municipality I-GPE 190 | promptly O 191 | activated O 192 | emergency O 193 | contingency O 194 | plans O 195 | . 
O 196 | 197 | The O 198 | traffic O 199 | administration O 200 | department O 201 | carried O 202 | out O 203 | traffic O 204 | supervision O 205 | near O 206 | the O 207 | accident O 208 | scene O 209 | . O 210 | 211 | Well O 212 | , O 213 | how O 214 | did O 215 | the O 216 | emergency O 217 | response O 218 | mechanisms O 219 | activated O 220 | by O 221 | governmental O 222 | departments O 223 | operate O 224 | effectively O 225 | during O 226 | the O 227 | holiday O 228 | ? O 229 | 230 | -------------------------------------------------------------------------------- /tests/unit/text/data/label_generator/labels/1.tsv: -------------------------------------------------------------------------------- 1 | After O 2 | the O 3 | holiday O 4 | , O 5 | what O 6 | will O 7 | be O 8 | done O 9 | to O 10 | handle O 11 | citizens B-PERSONTYPE 12 | ' O 13 | peak O 14 | commute O 15 | ? O 16 | 17 | In O 18 | addition O 19 | , O 20 | what O 21 | measures O 22 | did O 23 | relevant O 24 | departments O 25 | take O 26 | to O 27 | resolve O 28 | issues O 29 | such O 30 | as O 31 | waste O 32 | discharge O 33 | , O 34 | heating O 35 | , O 36 | and O 37 | communication O 38 | , O 39 | in O 40 | order O 41 | to O 42 | ensure O 43 | that O 44 | the O 45 | lives O 46 | of O 47 | citizens B-PERSONTYPE 48 | were O 49 | not O 50 | affected O 51 | ? O 52 | 53 | Well O 54 | , O 55 | we O 56 | have O 57 | invited O 58 | two B-NUMBER 59 | honorable O 60 | guests B-PERSONTYPE 61 | to O 62 | the O 63 | studio B-LOCATION 64 | today B-DATE 65 | to O 66 | follow O 67 | this O 68 | topic O 69 | with O 70 | us O 71 | . 
O 72 | 73 | One B-NUMBER 74 | of O 75 | the O 76 | two B-NUMBER 77 | honorable O 78 | guests B-PERSONTYPE 79 | in O 80 | the O 81 | studio B-LOCATION 82 | is O 83 | Professor O 84 | Zhou B-PERSONNAME 85 | Hanhua I-PERSONNAME 86 | from O 87 | the O 88 | Institute B-ORGANIZATION 89 | of I-ORGANIZATION 90 | Law I-ORGANIZATION 91 | of O 92 | the O 93 | Chinese B-ORGANIZATION 94 | Academy I-ORGANIZATION 95 | of I-ORGANIZATION 96 | Social I-ORGANIZATION 97 | Sciences I-ORGANIZATION 98 | . O 99 | 100 | Hello O 101 | . O 102 | 103 | Next B-ORDINAL 104 | is O 105 | Yang B-PERSONNAME 106 | Yang I-PERSONNAME 107 | , O 108 | a O 109 | host O 110 | of O 111 | Beijing B-ORGANIZATION 112 | Traffic I-ORGANIZATION 113 | Radio I-ORGANIZATION 114 | Station I-ORGANIZATION 115 | . O 116 | 117 | Hello O 118 | . O 119 | 120 | Welcome O 121 | both O 122 | of O 123 | you O 124 | to O 125 | the O 126 | studio O 127 | to O 128 | participate O 129 | in O 130 | our O 131 | program O 132 | . O 133 | 134 | Well O 135 | , O 136 | I O 137 | especially O 138 | want O 139 | to O 140 | know O 141 | , O 142 | ha O 143 | , O 144 | how O 145 | the O 146 | two B-NUMBER 147 | of O 148 | you O 149 | found O 150 | out O 151 | the O 152 | news O 153 | on O 154 | the B-DATE 155 | day I-DATE 156 | of O 157 | the O 158 | accident B-EVENT 159 | ? O 160 | 161 | Ah O 162 | , O 163 | , O 164 | about O 165 | 11:00 B-NUMBER 166 | m. O 167 | yesterday B-DATE 168 | , O 169 | ah O 170 | , O 171 | I O 172 | happened O 173 | to O 174 | find O 175 | out O 176 | through O 177 | an O 178 | SMS O 179 | when O 180 | I O 181 | was O 182 | outside O 183 | . O 184 | 185 | Uh-huh O 186 | . O 187 | 188 | Uh-huh O 189 | . O 190 | 191 | It O 192 | happened O 193 | that O 194 | I O 195 | was O 196 | going O 197 | to O 198 | have O 199 | lunch B-TIMERANGE 200 | with O 201 | a O 202 | friend B-PERSONTYPE 203 | , O 204 | um O 205 | , O 206 | at O 207 | noon B-TIME 208 | . 
O 209 | 210 | And O 211 | then O 212 | , O 213 | the O 214 | friend B-PERSONTYPE 215 | first B-ORDINAL 216 | sent O 217 | me O 218 | an O 219 | SMS O 220 | , O 221 | Uh-huh O 222 | . O 223 | 224 | saying O 225 | he O 226 | would O 227 | come O 228 | pick O 229 | me O 230 | up O 231 | to O 232 | go O 233 | together O 234 | . O 235 | 236 | After O 237 | that O 238 | , O 239 | I O 240 | received O 241 | an O 242 | SMS O 243 | from B-DATERANGE 244 | 1860 I-DATERANGE 245 | . O 246 | 247 | Uh-huh O 248 | , O 249 | it O 250 | was O 251 | through O 252 | an O 253 | SMS B-ORGANIZATION 254 | . O 255 | 256 | -------------------------------------------------------------------------------- /tests/unit/text/data/label_generator/text/0.txt: -------------------------------------------------------------------------------- 1 | basically, it was unanimously agreed upon by the various relevant parties. To express its determination, the Chinese securities regulatory department compares this stock reform to a die that has been cast. It takes time to prove whether the stock reform can really meet expectations, and whether any deviations that arise during the stock reform can be promptly corrected. Dear viewers, the China News program will end here. This is Xu Li. Thank you everyone for watching. Coming up is the Focus Today program hosted by Wang Shilin. Good-bye, dear viewers. Hello, dear viewers. Welcome to Focus Today. Today, let's turn our attention to a road cave - in accident that happened in Beijing over the holiday. Before dawn on January 3, a sewage pipe leakage accident occurred at the main and side roads of Jingguang Bridge , East Third Ring Road, Beijing Municipality, resulting in the road caving in. Relevant departments from Beijing Municipality promptly activated emergency contingency plans. The traffic administration department carried out traffic supervision near the accident scene. 
Well, how did the emergency response mechanisms activated by governmental departments operate effectively during the holiday ? -------------------------------------------------------------------------------- /tests/unit/text/data/label_generator/text/1.txt: -------------------------------------------------------------------------------- 1 | After the holiday, what will be done to handle citizens' peak commute? In addition, what measures did relevant departments take to resolve issues such as waste discharge, heating, and communication, in order to ensure that the lives of citizens were not affected? Well, we have invited two honorable guests to the studio today to follow this topic with us. One of the two honorable guests in the studio is Professor Zhou Hanhua from the Institute of Law of the Chinese Academy of Social Sciences. Hello. Next is Yang Yang, a host of Beijing Traffic Radio Station. Hello. Welcome both of you to the studio to participate in our program. Well, I especially want to know, ha, how the two of you found out the news on the day of the accident? Ah,, about 11:00 m. yesterday, ah, I happened to find out through an SMS when I was outside. Uh-huh. Uh-huh. It happened that I was going to have lunch with a friend, um, at noon. And then, the friend first sent me an SMS, Uh-huh. saying he would come pick me up to go together. After that, I received an SMS from 1860. Uh-huh, it was through an SMS. -------------------------------------------------------------------------------- /tests/unit/text/data/label_generator/text/11.txt: -------------------------------------------------------------------------------- 1 | And you, Yang Yang? A friend happened to call me. You were not at work that day? No. The station called me at noon and said something happened at Jingguang Bridge and that I had to go to the station immediately to research the upcoming program. Uh-huh, that means, er, you found out the accident through an information source at the station. 
Right, right, right. Uh-huh. Well, like Professor Zhou, I also received this news, ha, through a mobile phone SMS. At that time,, it can be said that this SMS was among the many, ha, SMS containing New Year wishes, like Happy New Year, received after the start of the New Year. Uh-huh. Ah, actually I felt a lot of warmth when I received that SMS. Although we live in the west instead of the east and it did not affect us much, I think it is very useful, ah, to inform people of this kind of news. Yes, exceptionally. Yes, exceptionally. Well, what in fact was the content of that SMS? Let's take a look via this footage, ha. I remember the SMS was written like this at that time, saying that, ah, there was a sewage pipe leakage accident on the side road at the southeast corner of Jingguang Bridge at East Third Ring Road, and, well, traffic supervision was implemented near Chaoyang Road, Jingguang Bridge, and East Third Ring Road, and requesting cars to make a detour. Some car owners said that it was very good that the SMS was sent. Furthermore, there was one last sentence in that SMS thanking citizens for their cooperation and support. Ah, after the SMS was sent ,, I felt it seems to be the first time that Beijing Municipality, ah, used an SMS to give notification at the time of a public emergency. I don't know, all of us are living in Beijing, is this the first time, Professor Zhou? Yes, in terms of an official notification , this should be the first time one was sent officially through 1860. Uh-huh. -------------------------------------------------------------------------------- /tests/unit/text/data/ocr_1.txt: -------------------------------------------------------------------------------- 1 | The book Between you and me /. Hello / Good evening / Is /. I'd we to ask it Wallace if he's ever turned down an inter / Turned down a what / Interview /. Taned down an interview /? Were you ever asked by CBS to say go do this guy ? 
and your sant no /7 # so I do n't remember / No I don't think so /. Ontario Florida head / like I wanted to know / Go ahead / the first time I was in wes bork I sow a nice looking young man on TV in's show bike and Butty / was that you ? That was me he he /, That was me and Buff Coco who was my ). That's not But taking, Is it /f to no he he /. cause Butt is up in New Hampshire /. She lives in a home up there /. She's not well /. Um yeah she and I used to do a show on CaS when I first came to he's work / and it was a fascinating /. It was a lene be like Regis and on Kathy or on Recy's and Kelly /. But you were married , right /1 Yes /. What was it wise to do a show with the wife 7 Not easy /. Ha ha / I'm serious /. You know in I'd love to see that ! on we used to backer on live or I. and what happened was after a while the bickering corewed sher we got off the air / After you got off the # 1. You know what I mean /. I know / Decree helio / bill. Is /. How are you /7 Fire ). He Wallace this is a big pleasure for me to talk to you / But tan is what is your most officus aderview that you had it Sixty Minutes the most difficult person that you could have over interviewed /? I think probably the Alitois really because he was not anxious to do i / It was on just otter the US hostages had been taken in van /, and I was surprised that he was wasing to talk to us /. and it was a very very autocult business / we did it in the holy city of which of we / and the circumstances were offcult /, They took good care to see that we and n't got into trouble / Ha ha / we # take a break !, And he just / we " be back with more of mike wallace /. The book is Between lexi and his / the DVD is backand / on what can one say it's a terrific work / we'll be right back / That voice was the subject of The Raider /. That man I. that man remains my hero / jet Wygen who took on the tobacco cartel If you will /. And you remen. 
bar when all those guys who ran the companies raised their hands and said On It's not aceactive /. they knew & was abiktive / And he has succeeded /. I mean really he has succeeded /. He runs s founds tion for Smoke Free Kids / and he's gotten off kind of success in of kinds of ways in foreign countries and so forth /. The man is my have /. And your are mine /. And we have a miage and s hat let /. I know you 're asked this all the time / but how long you going to keep on keeping on /. How ling you /. you know what the dickens would I do /7 what waxat I do /7 How king are you going to keep doing what you 're / was but you 're / How old are you like /? fighty . seven /. can you imagine /? I'm going to be seventy . two /. so you're fifteen years older than me ). That's why I feel we e kil compared to you 1. 1. -------------------------------------------------------------------------------- /tests/unit/text/test_lcs.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from genalog.text.lcs import LCS 4 | 5 | 6 | @pytest.fixture( 7 | params=[ 8 | ("", ""), # empty 9 | ("abcde", "ace"), # naive case 10 | ] 11 | ) 12 | def lcs(request): 13 | str1, str2 = request.param 14 | return LCS(str1, str2) 15 | 16 | 17 | def test_lcs_init(lcs): 18 | assert lcs._lcs_len is not None 19 | assert lcs._lcs is not None 20 | 21 | 22 | @pytest.mark.parametrize( 23 | "str1, str2, expected_len, expected_lcs", 24 | [ 25 | ("", "", 0, ""), # empty 26 | ("abc", "abc", 3, "abc"), 27 | ("abcde", "ace", 3, "ace"), # naive case 28 | ("a", "", 0, ""), # no results 29 | ("abc", "cba", 1, "c"), # multiple cases 30 | ("abcdgh", "aedfhr", 3, "adh"), 31 | ("abc.!\t\nd", "dxab", 2, "ab"), # with punctuations 32 | ( 33 | "New York @", 34 | "New @ York", 35 | len("New York"), 36 | "New York", 37 | ), # with space-separated, tokens 38 | ("Is A Big City", "A Big City Is", len("A Big City"), "A Big City"), 39 | ("Is A Big City", "City Big Is A", len(" 
Big "), " Big "), # reversed order 40 | # mixed order with similar tokens 41 | ("Is A Big City IS", "IS Big A City Is", len("I Big City I"), "I Big City I"), 42 | # casing 43 | ( 44 | "Is A Big City IS a", 45 | "IS a Big City Is A", 46 | len("I Big City I "), 47 | "I Big City I ", 48 | ), 49 | ], 50 | ) 51 | def test_lcs_e2e(str1, str2, expected_len, expected_lcs): 52 | lcs = LCS(str1, str2) 53 | assert expected_lcs == lcs.get_str() 54 | assert expected_len == lcs.get_len() 55 | -------------------------------------------------------------------------------- /tests/unit/text/test_preprocess.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from genalog.text import preprocess 4 | from genalog.text.alignment import GAP_CHAR 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "token, replacement, desired_output", 9 | [ 10 | ("", "_", ""), # Do nothing to empty string 11 | (" ", "_", " "), # Do nothing to whitespaces 12 | (" \n\t", "_", " \n\t"), 13 | ("ascii", "_", "ascii"), 14 | ("a s\nc\tii", "_", "a s\nc\tii"), 15 | ("ascii·", "_", "ascii"), # Tokens with non-ASCII values 16 | ("·", "_", "_"), # Tokens with non-ASCII values 17 | ], 18 | ) 19 | def test_remove_non_ascii(token, replacement, desired_output): 20 | for code in range(128, 1000): # non-ASCII values 21 | token.replace("·", chr(code)) 22 | output = preprocess.remove_non_ascii(token, replacement) 23 | assert output == desired_output 24 | 25 | 26 | @pytest.mark.parametrize( 27 | "s, desired_output", 28 | [ 29 | (" New \t \n", ["New"]), 30 | # Mixed in gap char "@" 31 | (" @ @", ["@", "@"]), 32 | ("New York is big", ["New", "York", "is", "big"]), 33 | # Mixed multiple spaces and tabs 34 | (" New York \t is \t big", ["New", "York", "is", "big"]), 35 | # Mixed in punctuation 36 | ("New .York is, big !", ["New", ".York", "is,", "big", "!"]), 37 | # Mixed in gap char "@" 38 | ("@N@ew York@@@is,\t big@@@@@", ["@N@ew", "York@@@is,", "big@@@@@"]), 39 | ], 40 | ) 41 | 
def test_tokenize(s, desired_output): # tokenize splits on runs of whitespace; punctuation and "@" gap chars stay attached to their tokens 42 | output = preprocess.tokenize(s) 43 | assert output == desired_output 44 | 45 | 46 | @pytest.mark.parametrize( 47 | "tokens, desired_output", 48 | [ 49 | ( 50 | ["New", "York", "is", "big"], 51 | "New York is big", 52 | ), 53 | # Mixed in punctuation 54 | ( 55 | ["New", ".York", "is,", "big", "!"], 56 | "New .York is, big !", 57 | ), 58 | # Mixed in gap char "@" 59 | ( 60 | ["@N@ew", "York@@@is,", "big@@@@@"], 61 | "@N@ew York@@@is, big@@@@@", 62 | ), 63 | ], 64 | ) 65 | def test_join_tokens(tokens, desired_output): # join_tokens is the single-space inverse of tokenize 66 | output = preprocess.join_tokens(tokens) 67 | assert output == desired_output 68 | 69 | 70 | @pytest.mark.parametrize( 71 | "c, desired_output", 72 | [ 73 | # Gap char 74 | (GAP_CHAR, False), 75 | # Alphabet char 76 | ("a", False), 77 | ("A", False), 78 | # Punctuation 79 | (".", False), 80 | ("!", False), 81 | (",", False), 82 | ("-", False), 83 | # Token separators 84 | (" ", True), 85 | ("\n", True), 86 | ("\t", True), 87 | ], 88 | ) 89 | def test__is_spacing(c, desired_output): # only literal whitespace characters count as token separators 90 | assert desired_output == preprocess._is_spacing(c) 91 | 92 | 93 | @pytest.mark.parametrize( 94 | "text, desired_output", 95 | [ 96 | ("", ""), 97 | ("w .", "w ."), 98 | ("w !", "w !"), 99 | ("w ?", "w ?"), 100 | ("w /.", "w /."), 101 | ("w /!", "w /!"), 102 | ("w /?", "w /?"), 103 | ("w1 , w2 .", "w1 , w2 ."), 104 | ("w1 . w2 .", "w1 . \nw2 ."), 105 | ("w1 /. w2 /.", "w1 /. \nw2 /."), 106 | ("w1 ! w2 .", "w1 ! \nw2 ."), 107 | ("w1 /! w2 /.", "w1 /! \nw2 /."), 108 | ("w1 ? w2 .", "w1 ? \nw2 ."), 109 | ("w1 /? w2 /.", "w1 /? \nw2 /."), 110 | ("U.S. . w2 .", "U.S. . \nw2 ."), 111 | ("w1 ??? w2 .", "w1 ??? w2 ."), # not splitting 112 | ("w1 !!! w2 .", "w1 !!! w2 ."), 113 | ("w1 ... . w2 .", "w1 ... . \nw2 ."), 114 | ("w1 ... /. w2 /.", "w1 ... /. \nw2 /."), 115 | ("w1 /. /. w2 .", "w1 /. /. \nw2 ."), 116 | ("w1 /. /.", "w1 /. \n/."), 117 | ("w1 /. /. ", "w1 /. /. \n"), 118 | ("w1 ? ? ? ? w2 .", "w1 ? ? ? ? \nw2 ."), 119 | ("w1 /? 
/? /? /? w2 /.", "w1 /? /? /? /? \nw2 /."), 120 | ("w1 ! ! ! ! w2 .", "w1 ! ! ! ! \nw2 ."), 121 | ("w1 /! /! /! /! w2 /.", "w1 /! /! /! /! \nw2 /."), 122 | ], 123 | ) 124 | def test_split_sentences(text, desired_output): 125 | assert desired_output == preprocess.split_sentences(text) 126 | 127 | 128 | @pytest.mark.parametrize( 129 | "token, desired_output", 130 | [ 131 | ("", False), 132 | (" ", False), 133 | ("\n", False), 134 | ("\t", False), 135 | (" \n \t", False), 136 | ("...", False), 137 | ("???", False), 138 | ("!!!", False), 139 | (".", True), 140 | ("!", True), 141 | ("?", True), 142 | ("/.", True), 143 | ("/!", True), 144 | ("/?", True), 145 | ], 146 | ) 147 | def test_is_sentence_separator(token, desired_output): 148 | assert desired_output == preprocess.is_sentence_separator(token) 149 | -------------------------------------------------------------------------------- /tests/unit/text/test_utf8.py: -------------------------------------------------------------------------------- 1 | import random 2 | import warnings 3 | 4 | import pytest 5 | 6 | from genalog.text import alignment 7 | from genalog.text.alignment import GAP_CHAR 8 | from tests.unit.cases.text_alignment import ALIGNMENT_REGRESSION_TEST_CASES 9 | 10 | 11 | def random_utf8_char(byte_len=1): 12 | if byte_len == 1: 13 | return chr(random.randint(0, 0x007F)) 14 | elif byte_len == 2: 15 | return chr(random.randint(0x007F, 0x07FF)) 16 | elif byte_len == 3: 17 | return chr(random.randint(0x07FF, 0xFFFF)) 18 | elif byte_len == 4: 19 | return chr(random.randint(0xFFFF, 0x10FFFF)) 20 | else: 21 | raise ValueError( 22 | f"Invalid byte length: {byte_len}." 
23 | + "utf-8 does not encode characters with more than 4 bytes in length" 24 | ) 25 | 26 | 27 | @pytest.mark.parametrize( 28 | "num_utf_char_to_test", [100] 29 | ) # Number of char per byte length 30 | @pytest.mark.parametrize( 31 | "byte_len", [1, 2, 3, 4] 32 | ) # UTF does not encode with more than 4 bytes 33 | @pytest.mark.parametrize( 34 | "gt_txt, noisy_txt, expected_aligned_gt, expected_aligned_noise", 35 | ALIGNMENT_REGRESSION_TEST_CASES, 36 | ) 37 | def test_align( 38 | num_utf_char_to_test, 39 | byte_len, 40 | gt_txt, 41 | noisy_txt, 42 | expected_aligned_gt, 43 | expected_aligned_noise, 44 | ): 45 | 46 | invalid_char = set(gt_txt).union( 47 | set(GAP_CHAR) 48 | ) # character to replace to cannot be in this set 49 | for _ in range(num_utf_char_to_test): 50 | utf_char = random_utf8_char(byte_len) 51 | while ( 52 | utf_char in invalid_char 53 | ): # find a utf char not in the input string and not GAP_CHAR 54 | utf_char = random_utf8_char(byte_len) 55 | char_to_replace = random.choice(list(invalid_char)) if gt_txt else "" 56 | 57 | gt_txt.replace(char_to_replace, utf_char) 58 | noisy_txt.replace(char_to_replace, utf_char) 59 | expected_aligned_gt_sub = expected_aligned_gt.replace(char_to_replace, utf_char) 60 | expected_aligned_noise_sub = expected_aligned_noise.replace( 61 | char_to_replace, utf_char 62 | ) 63 | 64 | # Run alignment 65 | aligned_gt, aligned_noise = alignment.align(gt_txt, noisy_txt) 66 | 67 | aligned_gt = aligned_gt.replace(char_to_replace, utf_char) 68 | aligned_noise = aligned_noise.replace(char_to_replace, utf_char) 69 | if aligned_gt != expected_aligned_gt_sub: 70 | expected_alignment = alignment._format_alignment( 71 | expected_aligned_gt_sub, expected_aligned_noise_sub 72 | ) 73 | result_alignment = alignment._format_alignment(aligned_gt, aligned_noise) 74 | warnings.warn( 75 | RuntimeWarning( 76 | f"\n\n****Expect alignment returns:****\n{expected_alignment} \n****But got:****\n{result_alignment}" 77 | ) 78 | ) 79 | 
-------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = flake8, py 3 | 4 | 5 | [testenv] 6 | passenv = 7 | # For e2e testing the OCR components 8 | BLOB_KEY 9 | BLOB_NAME 10 | COGNITIVE_SERVICE_KEY 11 | COMPUTER_VISION_SUBSCRIPTION_KEY 12 | SEARCH_SERVICE_KEY 13 | # Reading additional dependencies to run the test 14 | # https://tox.readthedocs.io/en/latest/example/basic.html#depending-on-requirements-txt-or-defining-constraints 15 | deps = -rrequirements-dev.txt 16 | commands = 17 | # {posargs} will be substituted by arguments after the `--` when running. 18 | # This will allow running a subset of the test suite via tox. 19 | # 20 | # EX: tox -- -m "not azure and not slow" 21 | # will pass {-m "not azure and not slow"} to `pytest` 22 | # See https://tox.readthedocs.io/en/latest/example/general.html for more details 23 | pytest {posargs} 24 | 25 | 26 | [testenv:flake8] 27 | deps = flake8 28 | skip_install = True 29 | commands = flake8 . 30 | 31 | 32 | # Configurations for running pytest 33 | [pytest] 34 | log_cli = False 35 | log_format = %(asctime)s %(levelname)s %(message)s 36 | junit_family = xunit2 37 | # This enables custom markers as decorator "@pytest.mark.slow" 38 | markers = 39 | # These two markers allow us to run a faster subset of the tests: 40 | # EX: pytest -m "not slow and not azure" 41 | # See https://docs.pytest.org/en/stable/example/markers.html#registering-markers 42 | slow: marks tests as slow (deselect with '-m "not slow"') 43 | azure: marks as integration tests that require azure resource 44 | io: marks integration tests involving some form of I/O operations (disk, internet, etc) 45 | testpaths = 46 | tests 47 | addopts = 48 | # reports all (except passed tests). 
See https://docs.pytest.org/en/latest/usage.html#detailed-summary-report 49 | -ra 50 | --cov-append --cov=genalog --cov-report=html --cov-report=term-missing --cov-report=xml --junitxml=junit/test-results.xml 51 | 52 | 53 | [flake8] 54 | # Configs for flake8-import-order, see https://pypi.org/project/flake8-import-order/ for more info. 55 | import-order-style=edited 56 | application-import-names=genalog, tests 57 | # Native flake8 configs 58 | max-line-length = 140 59 | exclude = 60 | build, dist, docs, example, 61 | .env*,.venv* # local virtual environments 62 | .tox 63 | --------------------------------------------------------------------------------