.html.jinja
17 | HTML_TEMPLATE = "columns.html.jinja"
18 |
19 | # Degradation effects applied in sequence
20 | DEGRADATIONS = [
21 | ("blur", {"radius": 3}), # needs to be an odd number
22 | ("bleed_through", {
23 | "src": ImageState.CURRENT_STATE, "background": ImageState.ORIGINAL_STATE,
24 | "alpha": 0.8,
25 | "offset_y": 9, "offset_x": 12
26 | }),
27 | ("morphology", {"operation": "open", "kernel_shape": (3, 3)}),
28 | ("pepper", {"amount": 0.05}),
29 | ("salt", {"amount": 0.05}),
30 | ]
31 |
32 | doc_generation = AnalogDocumentGeneration(styles=STYLE_COMBINATIONS, degradations=DEGRADATIONS)
33 | img_array = doc_generation.generate_img(sample_text, HTML_TEMPLATE, target_folder=None)
34 |
35 | import cv2
36 | from IPython.core.display import Image, display
37 |
38 | _, encoded_image = cv2.imencode('.png', img_array)
39 | display(Image(data=encoded_image, width=600))
40 |
41 |
--------------------------------------------------------------------------------
/example/ocr_label_propagation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## `genalog.text` module: \n",
8 | "This module is responsible for:\n",
9 | "1. Text alignment\n",
10 | "1. NER label propagation using text alignment results"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "from genalog.text import ner_label\n",
20 | "from genalog.text import preprocess\n",
21 | "\n",
22 | "gt_txt = \"New York is big\"\n",
23 | "ocr_txt = \"New Yo rkis big\"\n",
24 | "\n",
25 | "# Input to the method\n",
26 | "gt_labels = [\"B-P\", \"I-P\", \"O\", \"O\"]\n",
27 | "gt_tokens = preprocess.tokenize(gt_txt) # tokenize into list of tokens\n",
28 | "ocr_tokens = preprocess.tokenize(ocr_txt)"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {},
35 | "outputs": [
36 | {
37 | "name": "stdout",
38 | "output_type": "stream",
39 | "text": [
40 | "['B-P', 'I-P', 'O', 'O']\n",
41 | "['New', 'York', 'is', 'big']\n",
42 | "['New', 'Yo', 'rkis', 'big']\n"
43 | ]
44 | }
45 | ],
46 | "source": [
47 | "# Inputs to the method\n",
48 | "print(gt_labels)\n",
49 | "print(gt_tokens)\n",
50 | "print(ocr_tokens)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 5,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "# Method returns a tuple of 4 elements (ocr_labels, aligned_gt, aligned_ocr, gap_char)\n",
60 | "ocr_labels, aligned_gt, aligned_ocr, gap_char = ner_label.propagate_label_to_ocr(gt_labels, gt_tokens, ocr_tokens)"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 6,
66 | "metadata": {},
67 | "outputs": [
68 | {
69 | "name": "stdout",
70 | "output_type": "stream",
71 | "text": [
72 | "OCR labels: ['B-P', 'I-P', 'I-P', 'O']\n",
73 | "Aligned ground truth: New Yo@rk is big\n",
74 | "Aligned OCR text: New Yo rk@is big\n"
75 | ]
76 | }
77 | ],
78 | "source": [
79 | "# Outputs\n",
80 | "print(f\"OCR labels: {ocr_labels}\")\n",
81 | "print(f\"Aligned ground truth: {aligned_gt}\")\n",
82 | "print(f\"Aligned OCR text: {aligned_ocr}\")"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 9,
88 | "metadata": {},
89 | "outputs": [
90 | {
91 | "name": "stdout",
92 | "output_type": "stream",
93 | "text": [
94 | "B-P I-P O O \n",
95 | "New York is big \n",
96 | "New Yo@rk is big\n",
97 | "||||||.||.||||||\n",
98 | "New Yo rk@is big\n",
99 | "New Yo rkis big \n",
100 | "B-P I-P I-P O \n",
101 | "\n"
102 | ]
103 | }
104 | ],
105 | "source": [
106 | "# Format result for display\n",
107 | "print(ner_label.format_label_propagation(gt_tokens, gt_labels, ocr_tokens, ocr_labels, aligned_gt, aligned_ocr))"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 12,
113 | "metadata": {},
114 | "outputs": [
115 | {
116 | "name": "stdout",
117 | "output_type": "stream",
118 | "text": [
119 | "B-P I-P O O \n",
120 | "New York is big \n",
121 | "New Yo rkis big \n",
122 | "B-P I-P I-P O \n",
123 | "\n"
124 | ]
125 | }
126 | ],
127 | "source": [
128 | "# To turn off alignment information:\n",
129 | "print(ner_label.format_label_propagation(gt_tokens, gt_labels, ocr_tokens, ocr_labels, aligned_gt, aligned_ocr, show_alignment=False))"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 14,
135 | "metadata": {},
136 | "outputs": [
137 | {
138 | "name": "stdout",
139 | "output_type": "stream",
140 | "text": [
141 | "B-P I-P I-P O \n",
142 | "New Yo rkis big \n",
143 | "\n"
144 | ]
145 | }
146 | ],
147 | "source": [
148 | "# Format tokens and labels\n",
149 | "print(ner_label.format_labels(ocr_tokens, ocr_labels))"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": []
158 | }
159 | ],
160 | "metadata": {
161 | "kernelspec": {
162 | "display_name": "Python 3",
163 | "language": "python",
164 | "name": "python3"
165 | },
166 | "language_info": {
167 | "codemirror_mode": {
168 | "name": "ipython",
169 | "version": 3
170 | },
171 | "file_extension": ".py",
172 | "mimetype": "text/x-python",
173 | "name": "python",
174 | "nbconvert_exporter": "python",
175 | "pygments_lexer": "ipython3",
176 | "version": "3.6.9"
177 | }
178 | },
179 | "nbformat": 4,
180 | "nbformat_minor": 4
181 | }
182 |
--------------------------------------------------------------------------------
/example/sample/degradation/bleed_through.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/bleed_through.png
--------------------------------------------------------------------------------
/example/sample/degradation/blur.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/blur.png
--------------------------------------------------------------------------------
/example/sample/degradation/close_dilate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/close_dilate.png
--------------------------------------------------------------------------------
/example/sample/degradation/degrader.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/degrader.png
--------------------------------------------------------------------------------
/example/sample/degradation/degrader_heavy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/degrader_heavy.png
--------------------------------------------------------------------------------
/example/sample/degradation/kernel_morph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/kernel_morph.png
--------------------------------------------------------------------------------
/example/sample/degradation/open_erode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/open_erode.png
--------------------------------------------------------------------------------
/example/sample/degradation/salt_pepper.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/salt_pepper.png
--------------------------------------------------------------------------------
/example/sample/degradation/text_block.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/text_block.png
--------------------------------------------------------------------------------
/example/sample/degradation/text_zoomed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/degradation/text_zoomed.png
--------------------------------------------------------------------------------
/example/sample/generation/columns_Times_11px.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/columns_Times_11px.pdf
--------------------------------------------------------------------------------
/example/sample/generation/columns_Times_11px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/columns_Times_11px.png
--------------------------------------------------------------------------------
/example/sample/generation/example.txt:
--------------------------------------------------------------------------------
1 | Time magazine , in a move to reduce the costs of wooing new subscribers , is lowering its circulation guarantee to advertisers for the second consecutive year , increasing its subscription rates and cutting back on merchandise giveaways .
2 | In an announcement to its staff last week , executives at Time Warner Inc. 's weekly magazine said Time will `` dramatically de-emphasize '' its use of electronic giveaways such as telephones in television subscription drives ; cut the circulation it guarantees advertisers by 300,000 , to four million ; and increase the cost of its annual subscription rate by about $ 4 to $ 55 .
3 | In a related development , the news - weekly , for the fourth year in a row , said it wo n't increase its advertising rates in 1990 ; a full , four - color page in the magazine costs about $ 120,000 .
4 | However , because the guaranteed circulation base is being lowered , ad rates will be effectively 7.5 % higher per subscriber , according to Richard Heinemann , Time associate publisher .
5 | Time is following the course of some other mass - circulation magazines that in recent years have challenged the publishing myth that maintaining artificially high , and expensive , circulations is the way to draw advertisers .
6 | In recent years , Reader 's Digest , New York Times Co. 's McCall 's , and most recently News Corp. 's TV Guide , have cut their massive circulation rate bases to eliminate marginal circulation and hold down rates for advertisers .
7 | Deep discounts in subscriptions and offers of free clock radios and watches have become accepted forms of attracting new subscribers in the hyper-competitive world of magazine news - weeklies .
8 | But Time , as part of the more cost - conscious Time Warner , wants to wean itself away from expensive gimmicks .
9 | Besides , Time executives think selling a news magazine with a clock radio is tacky .
10 |
11 |
12 | `` Giveaways just give people the wrong image , '' said Mr. Heinemann .
13 | `` That perception takes the focus off the magazine . ''
14 | Time magazine executives predictably paint the circulation cut as a show of strength and actually a benefit to advertisers .
15 | `` What we are doing is screening out the readers who are only casually related to the magazine and do n't really read it , '' said Mr. Heinemann .
16 | `` We are trying to create quality and involvement . ''
17 | However , Time executives used the same explanation when in October 1988 the magazine cut its guaranteed circulation from 4.6 million to 4.3 million .
18 | And Time 's paid circulation , according to Audit Bureau of Circulations , dropped 7.3 % to 4,393,237 in the six months ended June 30 , 1989 .
19 | Still , Time 's move is being received well , once again .
20 | `` It 's terrific for advertisers to know the reader will be paying more , '' said Michael Drexler , national media director at Bozell Inc. ad agency .
21 | `` A few drops in circulation are of no consequence .
22 | It 's not a show of weakness ; they are improving the quality of circulation while insuring their profits . ''
23 | Mr. Heinemann said the changes represent a new focus in the magazine industry : a magazine 's net revenue per subscriber , or the actual revenue from subscribers after discounts and the cost of premiums have been stripped away .
24 | `` The question is how much are we getting from each reader , '' said Mr. Heinemann .
25 | Time 's rivals news - weeklies , Washington Post Co. 's Newsweek and U.S. News & World Report , are less reliant on electronic giveaways , and in recent years both have been increasing their circulation rate bases .
26 | Both magazines are expected to announce their ad rates and circulation levels for 1990 within a month .
--------------------------------------------------------------------------------
/example/sample/generation/letter_Times_11px.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/letter_Times_11px.pdf
--------------------------------------------------------------------------------
/example/sample/generation/letter_Times_11px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/letter_Times_11px.png
--------------------------------------------------------------------------------
/example/sample/generation/text_block_Times_11px.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/text_block_Times_11px.pdf
--------------------------------------------------------------------------------
/example/sample/generation/text_block_Times_11px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/text_block_Times_11px.png
--------------------------------------------------------------------------------
/example/sample/generation/text_block_Times_11px_pg_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/text_block_Times_11px_pg_0.png
--------------------------------------------------------------------------------
/example/sample/generation/text_block_Times_11px_pg_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/sample/generation/text_block_Times_11px_pg_1.png
--------------------------------------------------------------------------------
/example/static/analog_doc_gen_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/static/analog_doc_gen_pipeline.png
--------------------------------------------------------------------------------
/example/static/genalog_components.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/static/genalog_components.png
--------------------------------------------------------------------------------
/example/static/labeled_synthetic_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/example/static/labeled_synthetic_pipeline.png
--------------------------------------------------------------------------------
/genalog/README.md:
--------------------------------------------------------------------------------
1 | # Genalog Core
2 |
3 | This is the core of the package and contains all core components necessary to generate new docs, degrade the documents and get text out of degraded images using OCR Capabilities of Azure.
4 |
5 | ## Image Generation
6 |
7 | This directory contains the class implementations for image generation. The image generation leverages [Jinja templates](https://jinja.palletsprojects.com/en/2.11.x/templates/) for image generation. You can create a Jinja HTML template for any image layout and specify content variables to add content into images. This allows you the flexibility to be as declarative as possible.
8 |
9 | [Here is our guide to Image Generation](generation/README.md)
10 |
11 | ## Image Degradation
12 |
13 | This directory contains the class implementations for degrading your images such that they simulate real world Document degradations.
14 |
15 | [Here is our guide to Image Degradation](degradation/README.md)
16 |
17 | ## Extract Text from Images
18 |
19 | This directory contains the class implementations for Extract Text from Images using Azure OCR Process.
20 |
21 | [Here is our guide to Extract Text from Images](ocr/README.md)
22 |
23 | ## Text Alignment
24 |
25 | This directory contains the class implementations for text alignment. We expect that these capabilities will be required when you need to align text with its incorrect versions when you degrade documents and then have errors in OCR. We use [Biopython's](https://biopython.org/) implementation of the Needleman-Wunsch algorithm for text alignment as the method `genalog.text.alignment.align()`. This algorithm is an exhaustive search for all possible candidates with dynamic programming. It produces a weighted score for each candidate and returns those having the highest score. Note this is an algorithm with quadratic time and space complexity, and it is not efficient for aligning longer strings.
26 |
27 | For more efficient alignment on longer documents, we also include an implementation of the RETAS method from the paper ["A Fast Alignment Scheme for Automatic OCR Evaluation of Books"](https://ieeexplore.ieee.org/document/6065412) in `genalog.text.anchor.align_w_anchor()`. We would recommend using this method for input longer than 200 characters.
28 |
29 | [Here is our guide to Text Alignment](text/README.md)
30 |
--------------------------------------------------------------------------------
/genalog/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/genalog/__init__.py
--------------------------------------------------------------------------------
/genalog/degradation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/genalog/degradation/__init__.py
--------------------------------------------------------------------------------
/genalog/generation/.gitignore:
--------------------------------------------------------------------------------
1 | # output folders for debugging purpose
2 | output/
3 | # sample input for debugging
4 | sample/
--------------------------------------------------------------------------------
/genalog/generation/README.md:
--------------------------------------------------------------------------------
1 | ## Document Generation
2 |
3 | This folder contains the scripts that allow you to generate synthetic documents from any given text. We provide **three** standard templates with different document layouts:
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 | You can find these templates in path `genalog/generation/templates`.
12 |
13 | ### 1. Document Content
14 |
15 | The goal is to be able to generate synthetic documents on ANY text input. However, to properly initiate the content populating a document template, we need to create the `CompositeContent` class.
16 |
17 | ```python
18 | from genalog.generation.content import CompositeContent, ContentType
19 |
20 | # Here we are loading a sample text file in the root "example" directory
21 | # You may use any text as well.
22 | with open("example/sample/generation/example.txt", 'r') as f:
23 | text = f.read()
24 |
25 | # Initialize CompositeContent Object
26 | paragraphs = text.split('\n\n') # split paragraphs by `\n\n`
27 | content_types = [ContentType.PARAGRAPH] * len(paragraphs)
28 | content = CompositeContent(paragraphs, content_types)
29 | ```
30 | The `CompositeContent` is a list of pairs of bodies of text and their `ContentType`. Here we are declaring a list of multiple `ContentType.PARAGRAPH`s.
31 |
32 | ### 2. Populate Content into Template
33 |
34 | Once we initialized a `CompositeContent` object, we can populate the content into any standard template, via `DocumentGenerator` class.
35 |
36 | ```python
37 | from genalog.generation.document import DocumentGenerator
38 | default_generator = DocumentGenerator()
39 |
40 | print(f"Available default templates: {default_generator.template_list}")
41 | print(f"Default styles to generate: {default_generator.styles_to_generate}")
42 | ```
43 | The `DocumentGenerator` has default styles. The above code snippet will show the default configurations and the names of the 3 standard templates. You will use the information to select the template you want to generate. The three templates are `["columns.html.jinja", "letter.html.jinja", "text_block.html.jinja"]`
44 |
45 | ```python
46 | # Select specific template, content and create the generator
47 | doc_gen = default_generator.create_generator(content, ["columns.html.jinja", "letter.html.jinja", "text_block.html.jinja"])
48 | # we will use the `CompositeContent` object initialized from above cell
49 |
50 | # python generator
51 | for doc in doc_gen:
52 | template_name = doc.template.name.replace(".html.jinja", "")
53 | doc.render_png(target=f"example_{template_name}.png", resolution=300) #in dots per inch
54 | ```
55 | You can also retrieve the raw image byte information without specifying the `target`
56 |
57 | ```python
58 | from genalog.generation.document import DocumentGenerator
59 | from IPython.core.display import Image, display
60 |
61 | doc_gen = default_generator.create_generator(content, ['text_block.html.jinja'])
62 |
63 | for doc in doc_gen:
64 | image_byte = doc.render_png(resolution=100)
65 | display(Image(image_byte))
66 | ```
67 |
68 | Alternatively, you can also save the document as a PDF file.
69 |
70 | ```python
71 | # Select specific template, content and create the generator
72 | doc_gen = default_generator.create_generator(content, ['text_block.html.jinja'])
73 | # we will use the `CompositeContent` object initialized from above cell
74 |
75 | # python generator
76 | for doc in doc_gen:
77 | doc.render_pdf(target="example_text_block.pdf")
78 | ```
79 |
80 | ### Changing Document Styles
81 |
82 | You can alter the document styles including font family, font size, enabling hyphenation, and text alignment. These are mock style properties of their CSS counterparts. You can use standard CSS values to replace the following properties.
83 |
84 | ```python
85 | from genalog.generation.document import DocumentGenerator
86 | from IPython.core.display import Image, display
87 |
88 | # You can add as many options as possible. A new document will be generated per combination of the styles
89 | new_style_combinations = {
90 | "hyphenate": [True],
91 | "font_size": ["11px", "12px"], # most CSS units are supported `px`, `cm`, `em`, etc...
92 | "font_family": ["Times"],
93 | "text_align": ["justify"]
94 | }
95 |
96 | default_generator = DocumentGenerator()
97 | default_generator.set_styles_to_generate(new_style_combinations)
98 | # Examine the list of all style combinations to generate
99 | print(f"Styles to generate: {default_generator.styles_to_generate}")
100 |
101 | doc_gen = default_generator.create_generator(titled_content, ["columns.html.jinja", "letter.html.jinja"])
102 |
103 | for doc in doc_gen:
104 | print(doc.styles)
105 | print(doc.template.name)
106 | image_byte = doc.render_png(resolution=300)
107 | display(Image(image_byte))
108 | ```
109 |
--------------------------------------------------------------------------------
/genalog/generation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/genalog/generation/__init__.py
--------------------------------------------------------------------------------
/genalog/generation/content.py:
--------------------------------------------------------------------------------
1 | from enum import auto, Enum
2 |
3 |
class ContentType(Enum):
    """Enumeration of the content categories a document section can hold."""
    PARAGRAPH = auto()
    TITLE = auto()
    IMAGE = auto()  # NOTE(review): not yet handled by CompositeContent.construct_content
    COMPOSITE = auto()
9 |
10 |
class Content:
    """Abstract base class for a unit of document content.

    Subclasses assign a ``ContentType`` via ``set_content_type`` and store
    their payload in ``_content``; ``__str__``/``__iter__``/``__getitem__``
    delegate straight to that payload.
    """

    def __init__(self):
        # Whether the stored payload supports iteration.
        self.iterable = True
        self._content = None

    def set_content_type(self, content_type):
        """Validate and assign the content type.

        Arguments:
            content_type (ContentType): a member of the ``ContentType`` enum.

        Raises:
            TypeError: if ``content_type`` is not a ``ContentType`` member.
        """
        # isinstance is the idiomatic type check (was ``type(x) != ContentType``).
        if not isinstance(content_type, ContentType):
            raise TypeError(
                f"Invalid content type: {content_type}, valid types are {list(ContentType)}"
            )
        self.content_type = content_type

    def validate_content(self):
        """Validate the payload; subclasses must override.

        Bug fix: the original body was the bare expression
        ``NotImplementedError`` which evaluates to the class object and does
        nothing — the exception was never raised.
        """
        raise NotImplementedError

    def __str__(self):
        return self._content.__str__()

    def __iter__(self):
        return self._content.__iter__()

    def __getitem__(self, key):
        return self._content.__getitem__(key)
34 |
35 |
class Paragraph(Content):
    """A body-text section wrapping a single string."""

    def __init__(self, content):
        # Validate first; the type tag is a constant and always valid.
        self.validate_content(content)
        self.set_content_type(ContentType.PARAGRAPH)
        self._content = content

    def validate_content(self, content):
        """Ensure the payload is a plain string."""
        if isinstance(content, str):
            return
        raise TypeError(f"Expect a str, but got {type(content)}")
45 |
46 |
class Title(Content):
    """A heading section wrapping a single string."""

    def __init__(self, content):
        # Validate first; the type tag is a constant and always valid.
        self.validate_content(content)
        self.set_content_type(ContentType.TITLE)
        self._content = content

    def validate_content(self, content):
        """Ensure the payload is a plain string."""
        if isinstance(content, str):
            return
        raise TypeError(f"Expect a str, but got {type(content)}")
56 |
57 |
class CompositeContent(Content):
    """An ordered collection of ``Title``/``Paragraph`` sections.

    Built from two parallel lists: the raw strings and, per string, the
    ``ContentType`` to wrap it in.
    """

    def __init__(self, content_list, content_type_list):
        """
        Arguments:
            content_list (list): raw section bodies (str), one per section.
            content_type_list (list): a ``ContentType`` for each entry of
                ``content_list``; must be the same length.

        Raises:
            TypeError: if ``content_list`` is not a list.
            ValueError: if the two lists differ in length.
            NotImplementedError: for content types other than TITLE/PARAGRAPH.
        """
        self.set_content_type(ContentType.COMPOSITE)
        self.validate_content(content_list)
        self.construct_content(content_list, content_type_list)
        self.iterable = True

    def validate_content(self, content_list):
        """Ensure the payload is a list of section bodies."""
        if not isinstance(content_list, list):
            raise TypeError(f"Expect a list of content, but got {type(content_list)}")

    def construct_content(self, content_list, content_type_list):
        """Wrap each raw string in the ``Content`` subclass its type names."""
        # Bug fix: zip() silently truncated to the shorter list, dropping
        # sections without warning; fail loudly on mismatched inputs instead.
        if len(content_list) != len(content_type_list):
            raise ValueError(
                f"Expect equal-length lists, but got {len(content_list)} "
                f"contents and {len(content_type_list)} content types"
            )
        self._content = []
        for content, content_type in zip(content_list, content_type_list):
            if content_type == ContentType.TITLE:
                self._content.append(Title(content))
            elif content_type == ContentType.PARAGRAPH:
                self._content.append(Paragraph(content))
            else:
                raise NotImplementedError(f"{content_type} is not currently supported")

    def insert_content(self, new_content, index):
        """Not implemented. Bug fix: the original evaluated the bare
        ``NotImplementedError`` expression without raising it."""
        raise NotImplementedError

    def delete_content(self, index):
        """Not implemented. Bug fix: same silent no-op as ``insert_content``."""
        raise NotImplementedError

    def __repr__(self):
        return "CompositeContent(" + self._content.__repr__() + ")"

    def __str__(self):
        """Get a string transparent of the nested object types.

        Bug fix: the original appended ``", "`` after every element,
        leaving a dangling separator before the closing bracket; use
        ``str.join`` so separators appear only between elements.
        """
        return "[" + ", ".join(f'"{section}"' for section in self._content) + "]"
94 |
--------------------------------------------------------------------------------
/genalog/generation/templates/base.css.jinja:
--------------------------------------------------------------------------------
1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #}
--------------------------------------------------------------------------------
/genalog/generation/templates/base.html.jinja:
--------------------------------------------------------------------------------
1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #}
2 |
3 |
4 |
5 | {% if language %}
6 |
7 | {% else %}
8 |
9 | {% endif %}
10 |
11 |
12 | {%- block head %}
13 |
18 | {% endblock head %}
19 |
20 |
21 |
22 | {% block body %} {% endblock body %}
23 |
24 |
--------------------------------------------------------------------------------
/genalog/generation/templates/columns.css.jinja:
--------------------------------------------------------------------------------
1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #}
2 |
3 | .title, .authors {
4 | margin: auto;
5 | width: 80%;
6 | text-align: center;
7 | }
8 |
9 | .title {
10 | font-weight: bold;
11 | }
12 |
13 | .authors {
14 | font-style: italic;
15 | margin: 15px auto ;
16 | }
17 |
18 | .abstract {
19 | margin: auto;
20 | width: 100%;
21 | text-align: justify;
22 | margin-bottom: 5px;
23 | }
24 |
25 | .abstract-title {
26 | font-weight: bold;
27 | font-size: 14px;
28 | text-align: center;
29 | margin-bottom: 5px;
30 | }
31 |
32 | .columns {
33 | margin-top: 0;
34 | }
35 | .columns {
36 | column-gap: 40px;
37 | {% if column_num %}
38 | column-count: {{ column_num }};
39 | {% else %}
40 | column-count: 2;
41 | {% endif %}
42 | }
43 | .title {
44 | font-size: 16px;
45 | }
46 | .section-title {
47 | font-weight: bold;
48 | font-size: {{ font_size_title }};
49 | }
50 | .section-content {
51 |
52 | }
53 | img {
54 | max-width:100%;
55 | height:auto;
56 | }
--------------------------------------------------------------------------------
/genalog/generation/templates/columns.html.jinja:
--------------------------------------------------------------------------------
1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #}
2 |
3 | {% extends "base.html.jinja" %}
4 | {%- block style %}
5 | {# Global Style #}
6 | {% import "macro/dimension.css.jinja" as dimension %}
7 | {{ dimension.a4_paper() }}
8 | {% import "macro/text.css.jinja" as text %}
9 | {{ text.set_font(font_family, font_size) }}
10 | {{ text.set_hyphenation(hyphenate) }}
11 | {{ text.set_text_align(text_align) }}
12 | {% import "macro/page_layout.css.jinja" as layout %}
13 | {{ layout.set_page_num() }}
14 | {# Element-Specific Style #}
15 | {%- include "columns.css.jinja" with context %}
16 | {% endblock style %}
17 |
18 | {% block body %}
19 |
20 |
A Study of Wild Unicorns in a Rainbow-rich Habitat
21 |
22 |
23 |
24 | Pony Tail, Sweet Rock, Umbrella Mushroom
25 | Colourful University of Magic
26 | One Rainbow Road
27 | Utopia, 001
28 | everyone@happiness.joy
29 |
30 |
31 |
32 |
Abstract
33 | A study of wild unicorns in a rainbow-rich habitat,
34 | in an effort to understand the dynamics of this unusual animal.
35 | "Rainbows are considered a sign of life," explained Lise Saut ter,
36 | a scientist at the University of Ber gen in Norway and lead author
37 | of the study. "The unicorn also has a very interesting evolutionary
38 | history. This study is a first step toward understanding why unicorns
39 | behave the way they do." In order to better understand these unique
40 | animals, researchers collected four wild females from the rain forest
41 | in Northern Norway in 2006. They spent several weeks with them, feeding
42 | them on different types of wild fruit, grass and mushrooms, and recording
43 | the activity and responses of the wild animals.
44 |
45 |
46 | {% if content and content.content_type.__str__() == "ContentType.COMPOSITE"%}
47 |
48 | {% for c in content %}
49 | {% if c.content_type.__str__() == "ContentType.TITLE"%}
50 |
{{ c }}
51 | {% elif c.content_type.__str__() == "ContentType.PARAGRAPH" %}
52 |
{{ c }}
53 | {% else %}
54 |
Unsupported Content Type: {{c.content_type.__str__()}}
55 | {% endif %}
56 | {% endfor %}
57 |
58 | {% else %}
59 |
60 | No content loaded or content is not an instance of CompositeContent Class
61 |
62 | {% endif %}
63 |
64 | {% endblock body %}
65 |
--------------------------------------------------------------------------------
/genalog/generation/templates/letter.css.jinja:
--------------------------------------------------------------------------------
{# Copyright (c) Microsoft Corporation. All rights reserved. #}

/* Bold section headings; size is driven by the template's
   font_size_title variable at render time. */
.section-title {
    font-weight: bold;
    font-size: {{ font_size_title }};
}

/* Centered, bold-italic letterhead block at the top of the letter. */
.letter-head {
    margin: auto;
    width: 50%;
    text-align: center;
    font-size: 16px;
    font-weight: bold;
    font-style: italic;
}

/* Collapse the default top margin of paragraphs inside the letterhead. */
.letter-head p {
    margin-top: 0;
}

/* Vertical spacing around the addressee ("Dear ...") line. */
.addressee {
    margin: 30px 0 15px 0 ;
}
24 |
25 |
--------------------------------------------------------------------------------
/genalog/generation/templates/letter.html.jinja:
--------------------------------------------------------------------------------
1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #}
2 |
3 | {% extends "base.html.jinja" %}
4 | {%- block style %}
5 | {% import "macro/dimension.css.jinja" as dimension %}
6 | {{ dimension.a4_paper() }}
7 | {% import "macro/text.css.jinja" as text %}
8 | {{ text.set_font(font_family, font_size) }}
9 | {{ text.set_hyphenation(hyphenate) }}
10 | {{ text.set_text_align(text_align) }}
11 | {% import "macro/page_layout.css.jinja" as layout %}
12 | {{ layout.set_page_num() }}
13 | {%- include "letter.css.jinja" with context %}
14 | {% endblock style %}
15 |
16 | {% block body %}
17 |
18 |

19 |
Company X
20 | One Company Road
21 | City, State, 0001
22 | January 1st, 2020
23 |
24 |
25 |
26 | Dear Mr/Ms. X
27 |
28 |
29 | {% if content and content.content_type.__str__() == "ContentType.COMPOSITE"%}
30 |
31 | {% for c in content %}
32 | {% if c.content_type.__str__() == "ContentType.TITLE"%}
33 |
{{ c }}
34 | {% elif c.content_type.__str__() == "ContentType.PARAGRAPH" %}
35 |
{{ c }}
36 | {% else %}
37 |
Unsupported Content Type: {{c.content_type.__str__()}}
38 | {% endif %}
39 | {% endfor %}
40 |
41 | {% else %}
42 |
43 | No content loaded or content is not an instance of CompositeContent Class
44 |
45 | {% endif %}
46 | {% endblock body %}
--------------------------------------------------------------------------------
/genalog/generation/templates/macro/dimension.css.jinja:
--------------------------------------------------------------------------------
1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #}
2 |
{# Emit an @page rule fixing the printed page size and margin (cm units). #}
{% macro set_page_dimension(width, height, margin) -%}
@page {
    size: {{ width }}cm {{ height }}cm;
    margin: {{ margin }}cm;
}
{% endmacro %}

{# Nominal A4 page (21cm x 30cm; note true A4 height is 29.7cm — presumably
   rounded on purpose) with a configurable margin. #}
{% macro a4_paper(margin=2) %}
{{ set_page_dimension(21, 30, margin) }}
{% endmacro %}
--------------------------------------------------------------------------------
/genalog/generation/templates/macro/page_layout.css.jinja:
--------------------------------------------------------------------------------
1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #}
2 |
{# Number each page in the bottom-right corner using CSS paged-media counters. #}
{% macro set_page_num() -%}
@page {
    @bottom-right { content: counter(page); }
}
{% endmacro %}

{# Force a plain white page background. #}
{% macro set_page_bg() %}
@page {
    background: white;
}
{% endmacro%}
--------------------------------------------------------------------------------
/genalog/generation/templates/macro/text.css.jinja:
--------------------------------------------------------------------------------
1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #}
2 |
{# Set the document-wide font family and size on the html root element. #}
{% macro set_font(font_family, size) -%}
html {
    font-family: {{ font_family }};
    font-size: {{ size }};
}
{% endmacro %}

{# Toggle automatic CSS hyphenation for the whole document (default: on). #}
{% macro set_hyphenation(hyphenate=True) -%}
{% if hyphenate %}
html { hyphens: auto; }
{% else %}
html { hyphens: none; }
{% endif %}
{% endmacro %}

{# Set the document-wide text alignment (e.g. left / justify). #}
{% macro set_text_align(alignment) -%}
html { text-align: {{ alignment }} }
{% endmacro %}
--------------------------------------------------------------------------------
/genalog/generation/templates/text_block.css.jinja:
--------------------------------------------------------------------------------
1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #}
--------------------------------------------------------------------------------
/genalog/generation/templates/text_block.html.jinja:
--------------------------------------------------------------------------------
1 | {# Copyright (c) Microsoft Corporation. All rights reserved. #}
2 |
3 | {% extends "base.html.jinja" %}
4 | {%- block style %}
5 | {# Global Style #}
6 | {% import "macro/dimension.css.jinja" as dimension %}
7 | {{ dimension.a4_paper() }}
8 | {% import "macro/text.css.jinja" as text %}
9 | {{ text.set_font(font_family, font_size) }}
10 | {{ text.set_hyphenation(hyphenate) }}
11 | {{ text.set_text_align(text_align) }}
12 | {# Element-Specific Style #}
13 | {%- include "text_block.css.jinja" with context %}
14 | {% endblock style %}
15 |
16 | {% block body %}
17 | {% if content and content.content_type.__str__() == "ContentType.COMPOSITE"%}
18 |
19 | {% for c in content %}
20 | {% if c.content_type.__str__() == "ContentType.PARAGRAPH" %}
21 |
{{ c }}
22 | {% else %}
23 |
Unsupported Content Type: {{c.content_type.__str__()}}
24 | {% endif %}
25 | {% endfor %}
26 |
27 | {% else %}
28 |
29 | No content loaded or content is not an instance of CompositeContent Class
30 |
31 | {% endif %}
32 | {% endblock body %}
--------------------------------------------------------------------------------
/genalog/ocr/README.md:
--------------------------------------------------------------------------------
1 | # GROK Client
2 |
3 | Use the GROK client to make rest calls to the Azure Search Service to create and run the indexing pipeline. Blob client is used to transfer the images to blob and download the extracted OCR from blob.
4 |
5 | Example usage:
6 |
7 | 1. Create an .env file with the environment variables that include the names of your index, indexer, skillset, and datasource to create on the search service. Include keys to the blob that contains the documents you want to index, keys to the cognitive service, and keys to your computer vision subscription and search service. In order to index more than 20 documents, you must have a cognitive services subscription. You can find the keys for the services in the Azure Portal. An example of the .env file content is given below:
8 |
9 | ```bash
10 |
11 | SEARCH_SERVICE_NAME = "ocr-ner-pipeline"
12 | SKILLSET_NAME = "ocrskillset"
13 | INDEX_NAME = "ocrindex"
14 | INDEXER_NAME = "ocrindexer"
15 | DATASOURCE_NAME = "syntheticimages"
16 | DATASOURCE_CONTAINER_NAME = "ocrimages"
17 | PROJECTIONS_CONTAINER_NAME = "ocrprojection"
18 |
19 | BLOB_NAME = "syntheticimages"
20 | BLOB_KEY = ""
21 | SEARCH_SERVICE_KEY = ""
22 | COGNITIVE_SERVICE_KEY = ""
23 | ```
24 |
25 | 2. Source this .env file to load the variables; then you can create and use the Grok class, REST client, or blob client.
26 |
27 | 3. First, we need to upload our image files to Azure blob. To do this, we use the blob client and call the `upload_images_to_blob` function. This function takes in the local and remote path and an optional parameter to specify whether to use [asyncio](https://docs.python.org/3/library/asyncio.html) for asynchronous uploads. Asynchronous uploads are faster; however, some setups of Python may not support them. In such cases, synchronous uploads can be made using `use_async=False`.
28 |
29 | ```python
30 | from genalog.ocr.blob_client import GrokBlobClient
31 | from dotenv import load_dotenv
32 | load_dotenv(".env")
33 | destination_folder_name, upload_task = blob_client.upload_images_to_blob(local_path, remote_path, use_async=True)
34 | await upload_task
35 | ```
36 |
37 | 4. Once files are uploaded, use the rest client to create an indexing pipeline to extract the text from the images on blob. The results are stored as json blobs in a projection blob container where the names of these json blobs are the base64 encoded paths of the source blob images. The name of this projection container is specified in the env file. The `poll_indexer_till_complete` will block and continuously poll the indexer until it completely processes all docs.
38 |
39 | ```python
40 | from genalog.ocr.rest_client import GrokRestClient
41 | from dotenv import load_dotenv
42 | load_dotenv(".env")
43 |
44 | grok_rest_client = GrokRestClient.create_from_env_var()
45 | grok_rest_client.create_indexing_pipeline()
46 | grok_rest_client.run_indexer()
47 | indexer_status = grok_rest_client.poll_indexer_till_complete()
48 |
49 | ```
50 |
51 | 5. Once the indexer completes, use the blob client to download the results from the projections blob.
52 |
53 | ```python
54 | from genalog.ocr.blob_client import GrokBlobClient
55 | from dotenv import load_dotenv
56 | load_dotenv(".env")
57 |
58 | output_folder = "./ocr"
59 | async_download_task = blob_client.get_ocr_json( remote_path, output_folder, use_async=True)
60 | await async_download_task
61 | ```
62 |
63 | 6. Alternatively, steps 3, 4 and 5 can be skipped by using the Grok class. This class is a wrapper of the rest and blob clients. It uploads images from src_folder_path to blob, runs the indexer, then downloads the ocr projections to dest_folder_path
64 |
65 |
66 | ```python
67 | from genalog.ocr.grok import Grok
68 | from dotenv import load_dotenv
69 | load_dotenv("tests/unit/ocr/.env")
70 |
71 | grok = Grok.create_from_env_var()
72 | grok.run_grok(src_folder_path = "tests/unit/ocr/data/img", dest_folder_path = "tests/unit/ocr/data/json")
73 | ```
74 |
75 |
--------------------------------------------------------------------------------
/genalog/ocr/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/genalog/ocr/__init__.py
--------------------------------------------------------------------------------
/genalog/ocr/common.py:
--------------------------------------------------------------------------------
1 | # ---------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation. All rights reserved.
3 | # Licensed under the MIT License.
4 | # ---------------------------------------------------------
5 |
6 | DEFAULT_PROJECTIONS_CONTAINER_NAME = "ocrprojections"
7 |
--------------------------------------------------------------------------------
/genalog/ocr/grok.py:
--------------------------------------------------------------------------------
1 | # ---------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation. All rights reserved.
3 | # Licensed under the MIT License.
4 | # ---------------------------------------------------------
5 |
6 | import time
7 |
8 | from .blob_client import GrokBlobClient
9 | from .rest_client import GrokRestClient
10 |
11 |
class Grok:
    """End-to-end driver for the GROK OCR flow: uploads images to blob
    storage, runs the Azure Search indexing pipeline over them, and
    downloads the resulting OCR json projections."""

    @staticmethod
    def create_from_env_var():
        """Initializes Grok based on keys in the environment variables.

        Both the REST and blob clients read their connection settings
        (service names, keys, container names) from the environment.

        Returns:
            Grok: the Grok client
        """
        grok_rest_client = GrokRestClient.create_from_env_var()
        grok_blob_client = GrokBlobClient.create_from_env_var()
        return Grok(grok_rest_client, grok_blob_client)

    def __init__(
        self, grok_rest_client: GrokRestClient, grok_blob_client: GrokBlobClient
    ):
        # REST client drives the indexing pipeline (index, indexer, skillset,
        # datasource); blob client handles image upload and OCR-json download.
        self.grok_rest_client = grok_rest_client
        self.grok_blob_client = grok_blob_client

    def run_grok(
        self,
        src_folder_path,
        dest_folder_path,
        blob_dest_folder=None,
        cleanup=False,
        use_async=True,
    ):
        """Uploads images in the source folder to blob, sets up an indexing pipeline to run
        GROK OCR on this blob storage as a source, then downloads the OCR output json to the
        destination folder. The resulting json files have the same names as the original
        images except prefixed with the name of their folder on the blob storage and
        suffixed with the .json extension.

        Args:
            src_folder_path (str): Path to folder holding the images. This folder must only contain png or jpg files
            dest_folder_path (str): Path to folder where OCR json files will be placed
            blob_dest_folder (str, optional): Folder tag to use on the blob storage. If set to None, a hash is generated
                based on the names of files in the src folder. Defaults to None.
            cleanup (bool, optional): If set to True, the indexing pipeline is deleted, and the files uploaded to the blob are
                deleted from blob after running. Defaults to False.
            use_async (bool, optional): If set to True, blob uploads/downloads are made asynchronously.
                Defaults to True.

        Returns:
            indexer_status json, blob folder name

        Raises:
            RuntimeError: if the indexer reports an error status, or if the
                indexing run does not finish with status "success".
        """
        print("uploading images to blob")
        blob_folder_name, _ = self.grok_blob_client.upload_images_to_blob(
            src_folder_path, dest_folder_name=blob_dest_folder, use_async=use_async
        )
        print(f"images upload under folder {blob_folder_name}")
        try:
            print("creating and running indexer")
            self.grok_rest_client.create_indexing_pipeline()
            # brief pause before querying status of the freshly created pipeline
            time.sleep(2)

            indexer_status = self.grok_rest_client.get_indexer_status()
            if indexer_status["status"] == "error":
                raise RuntimeError(f"indexer error: {indexer_status}")

            # if not already running start the indexer
            print("indexer_status", indexer_status)
            if (
                indexer_status["lastResult"] is None
                or indexer_status["lastResult"]["status"] != "inProgress"
            ):
                self.grok_rest_client.run_indexer()

            time.sleep(1)
            print("\nrunning indexer")
            # blocks until the indexer reaches a terminal state
            indexer_status = self.grok_rest_client.poll_indexer_till_complete()
            if indexer_status["lastResult"]["status"] == "success":
                # NOTE(review): fixed 30s wait — presumably gives the knowledge-store
                # projections time to land in blob storage before download; confirm.
                time.sleep(30)
                print("fetching ocr json results.")
                self.grok_blob_client.get_ocr_json(
                    blob_folder_name, dest_folder_path, use_async=use_async
                )
                print(f"indexer status {indexer_status}")
                print(
                    f"finished running indexer. json files saved to {dest_folder_path}"
                )
            else:
                print("GROK failed", indexer_status["status"])
                raise RuntimeError("GROK failed", indexer_status["status"])
            return indexer_status, blob_folder_name
        finally:
            # cleanup runs whether or not the pipeline succeeded
            if cleanup:
                print("cleaning up indexer pipeline and blob store")
                self.cleanup(blob_folder_name)

    def cleanup(self, folder_name):
        """Deletes the indexing pipeline (index, indexer, datasource, skillset) from the search service.
        Deletes uploaded files from the blob.

        Args:
            folder_name (str): blob folder name tag to remove
        """
        self.grok_blob_client.delete_blobs_folder(folder_name)
        self.grok_rest_client.delete_indexer_pipeline()
108 |
--------------------------------------------------------------------------------
/genalog/ocr/templates/datasource.json:
--------------------------------------------------------------------------------
1 | {
2 | "description" : "ocr image datasource",
3 | "credentials" : { "connectionString" : "" },
4 | "container" : {"name": ""}
5 | }
--------------------------------------------------------------------------------
/genalog/ocr/templates/index.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "[INDEX_NAME]",
3 | "fields": [
4 | {
5 | "name": "content",
6 | "type": "Edm.String",
7 | "facetable": false,
8 | "filterable": false,
9 | "key": false,
10 | "retrievable": true,
11 | "searchable": true,
12 | "sortable": false,
13 | "analyzer": "standard.lucene",
14 | "indexAnalyzer": null,
15 | "searchAnalyzer": null,
16 | "synonymMaps": [],
17 | "fields": []
18 | },
19 | {
20 | "name": "metadata_storage_content_type",
21 | "type": "Edm.String",
22 | "facetable": false,
23 | "filterable": false,
24 | "key": false,
25 | "retrievable": false,
26 | "searchable": false,
27 | "sortable": false,
28 | "analyzer": null,
29 | "indexAnalyzer": null,
30 | "searchAnalyzer": null,
31 | "synonymMaps": [],
32 | "fields": []
33 | },
34 | {
35 | "name": "metadata_storage_size",
36 | "type": "Edm.Int64",
37 | "facetable": false,
38 | "filterable": false,
39 | "retrievable": false,
40 | "sortable": false,
41 | "analyzer": null,
42 | "indexAnalyzer": null,
43 | "searchAnalyzer": null,
44 | "synonymMaps": [],
45 | "fields": []
46 | },
47 | {
48 | "name": "metadata_storage_last_modified",
49 | "type": "Edm.DateTimeOffset",
50 | "facetable": false,
51 | "filterable": false,
52 | "retrievable": true,
53 | "sortable": false,
54 | "analyzer": null,
55 | "indexAnalyzer": null,
56 | "searchAnalyzer": null,
57 | "synonymMaps": [],
58 | "fields": []
59 | },
60 | {
61 | "name": "metadata_storage_content_md5",
62 | "type": "Edm.String",
63 | "facetable": false,
64 | "filterable": false,
65 | "key": false,
66 | "retrievable": true,
67 | "searchable": false,
68 | "sortable": false,
69 | "analyzer": null,
70 | "indexAnalyzer": null,
71 | "searchAnalyzer": null,
72 | "synonymMaps": [],
73 | "fields": []
74 | },
75 | {
76 | "name": "metadata_storage_name",
77 | "type": "Edm.String",
78 | "facetable": false,
79 | "filterable": false,
80 | "key": false,
81 | "retrievable": true,
82 | "searchable": true,
83 | "sortable": true,
84 | "analyzer": null,
85 | "indexAnalyzer": null,
86 | "searchAnalyzer": null,
87 | "synonymMaps": [],
88 | "fields": []
89 | },
90 | {
91 | "name": "metadata_storage_path",
92 | "type": "Edm.String",
93 | "facetable": false,
94 | "filterable": false,
95 | "key": true,
96 | "retrievable": true,
97 | "searchable": false,
98 | "sortable": false,
99 | "analyzer": null,
100 | "indexAnalyzer": null,
101 | "searchAnalyzer": null,
102 | "synonymMaps": [],
103 | "fields": []
104 | },
105 | {
106 | "name": "metadata_content_type",
107 | "type": "Edm.String",
108 | "facetable": false,
109 | "filterable": false,
110 | "key": false,
111 | "retrievable": false,
112 | "searchable": false,
113 | "sortable": false,
114 | "analyzer": null,
115 | "indexAnalyzer": null,
116 | "searchAnalyzer": null,
117 | "synonymMaps": [],
118 | "fields": []
119 | },
120 | {
121 | "name": "merged_content",
122 | "type": "Edm.String",
123 | "facetable": false,
124 | "filterable": false,
125 | "key": false,
126 | "retrievable": true,
127 | "searchable": true,
128 | "sortable": false,
129 | "analyzer": "standard.lucene",
130 | "indexAnalyzer": null,
131 | "searchAnalyzer": null,
132 | "synonymMaps": [],
133 | "fields": []
134 | },
135 | {
136 | "name": "text",
137 | "type": "Collection(Edm.String)",
138 | "facetable": false,
139 | "filterable": false,
140 | "retrievable": true,
141 | "searchable": true,
142 | "analyzer": "standard.lucene",
143 | "indexAnalyzer": null,
144 | "searchAnalyzer": null,
145 | "synonymMaps": [],
146 | "fields": []
147 | },
148 | {
149 | "name": "layoutText",
150 | "type": "Collection(Edm.String)",
151 | "facetable": false,
152 | "filterable": false,
153 | "retrievable": true,
154 | "searchable": true,
155 | "analyzer": "standard.lucene",
156 | "indexAnalyzer": null,
157 | "searchAnalyzer": null,
158 | "synonymMaps": [],
159 | "fields": []
160 | }
161 | ],
162 | "suggesters": [],
163 | "scoringProfiles": [],
164 | "defaultScoringProfile": "",
165 | "corsOptions": null,
166 | "analyzers": [],
167 | "charFilters": [],
168 | "tokenFilters": [],
169 | "tokenizers": []
170 | }
--------------------------------------------------------------------------------
/genalog/ocr/templates/indexer.json:
--------------------------------------------------------------------------------
1 | {
2 | "fieldMappings": [
3 | {
4 | "sourceFieldName": "metadata_storage_path",
5 | "targetFieldName": "metadata_storage_path",
6 | "mappingFunction": {
7 | "name": "base64Encode"
8 | }
9 | }
10 | ],
11 | "outputFieldMappings": [
12 | {
13 | "sourceFieldName": "/document/merged_content",
14 | "targetFieldName": "merged_content"
15 | },
16 | {
17 | "sourceFieldName": "/document/normalized_images/*/text",
18 | "targetFieldName": "text"
19 | },
20 | {
21 | "sourceFieldName": "/document/normalized_images/*/layoutText",
22 | "targetFieldName": "layoutText"
23 | }
24 | ],
25 | "parameters": {
26 | "maxFailedItems": -1,
27 | "configuration": {
28 | "dataToExtract": "contentAndMetadata",
29 | "imageAction": "generateNormalizedImages"
30 | }
31 | }
32 | }
--------------------------------------------------------------------------------
/genalog/ocr/templates/knowledge_store.json:
--------------------------------------------------------------------------------
1 | {
2 | "projections": [
3 | {
4 | "tables": [ ],
5 | "objects": [
6 | {
7 | "storageContainer": "projections",
8 | "source": null,
9 | "generatedKeyName": "myobject",
10 | "sourceContext": "/document",
11 | "inputs": [
12 | {
13 | "name": "metadata_storage_name",
14 | "source": "/document/metadata_storage_name"
15 | },
16 | {
17 | "name": "metadata_storage_path",
18 | "source": "/document/metadata_storage_path"
19 | },
20 | {
21 | "name": "ocrText",
22 | "source": "/document/normalized_images/*/text"
23 | },
24 | {
25 | "name": "ocrLayoutText",
26 | "source": "/document/normalized_images/*/layoutText"
27 | }
28 | ]
29 |
30 | }
31 | ],
32 | "files": []
33 | }
34 | ]
35 | }
--------------------------------------------------------------------------------
/genalog/ocr/templates/skillset.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "example_skillset",
3 | "description": "Skillset to run ocr on docs ;",
4 | "skills": [
5 | {
6 | "@odata.type": "#Microsoft.Skills.Text.MergeSkill",
7 | "name": "#1",
8 | "context": "/document",
9 | "insertPreTag": " ",
10 | "insertPostTag": " ",
11 | "inputs": [
12 | {
13 | "name": "text",
14 | "source": "/document/content"
15 | },
16 | {
17 | "name": "itemsToInsert",
18 | "source": "/document/normalized_images/*/text"
19 | },
20 | {
21 | "name": "offsets",
22 | "source": "/document/normalized_images/*/contentOffset"
23 | }
24 | ],
25 | "outputs": [
26 | {
27 | "name": "mergedText",
28 | "targetName": "merged_content"
29 | }
30 | ]
31 | },
32 | {
33 | "@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
34 | "name": "#2",
35 | "context": "/document/normalized_images/*",
36 | "lineEnding": "Space",
37 | "defaultLanguageCode": "en",
38 | "detectOrientation": true,
39 | "inputs": [
40 | {
41 | "name": "image",
42 | "source": "/document/normalized_images/*"
43 | }
44 | ],
45 | "outputs": [
46 | {
47 | "name": "text",
48 | "targetName": "text"
49 | },
50 | {
51 | "name": "layoutText",
52 | "targetName": "layoutText"
53 | }
54 | ]
55 | }
56 | ],
57 | "cognitiveServices": {
58 | "@odata.type": "#Microsoft.Azure.Search.CognitiveServicesByKey",
59 | "description": "cognitive service provider",
60 | "key": ""
61 | }
62 | }
--------------------------------------------------------------------------------
/genalog/text/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/genalog/text/__init__.py
--------------------------------------------------------------------------------
/genalog/text/lcs.py:
--------------------------------------------------------------------------------
1 | # ---------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation. All rights reserved.
3 | # Licensed under the MIT License.
4 | # ---------------------------------------------------------
5 |
class LCS:
    """Longest Common Subsequence (LCS) of two strings.

    The classic O(m*n) dynamic-programming table is filled once at
    construction time; the subsequence length and one LCS string are then
    available via :meth:`get_len` and :meth:`get_str`.
    """

    def __init__(self, str_m, str_n):
        self.str_m_len = len(str_m)
        self.str_n_len = len(str_n)
        table = self._build_table(str_m, str_n)
        # bottom-right cell holds the LCS length of the full strings
        self._lcs_len = table[self.str_m_len][self.str_n_len]
        self._lcs = self._backtrack(str_m, str_n, table)

    def _build_table(self, str_m, str_n):
        """Fill the DP table: cell [i][j] is the LCS length of str_m[:i] and str_n[:j]."""
        rows, cols = self.str_m_len, self.str_n_len
        table = [[0] * (cols + 1) for _ in range(rows + 1)]
        for i in range(1, rows + 1):
            for j in range(1, cols + 1):
                if str_m[i - 1] == str_n[j - 1]:
                    # matching characters extend the diagonal subsequence
                    table[i][j] = table[i - 1][j - 1] + 1
                else:
                    # otherwise carry over the better of top/left neighbors
                    table[i][j] = max(table[i - 1][j], table[i][j - 1])
        return table

    def _backtrack(self, str_m, str_n, table):
        """Walk the DP table from the bottom-right corner to recover one LCS string."""
        i, j = self.str_m_len, self.str_n_len
        pieces = []
        while i > 0 and j > 0:
            if str_m[i - 1] == str_n[j - 1]:
                # character is part of the LCS; collect it and step diagonally
                pieces.append(str_m[i - 1])
                i -= 1
                j -= 1
            elif table[i - 1][j] > table[i][j - 1]:
                i -= 1
            else:
                j -= 1
        # characters were collected back-to-front
        return "".join(reversed(pieces))

    def get_len(self):
        """Return the length of the longest common subsequence."""
        return self._lcs_len

    def get_str(self):
        """Return one longest common subsequence string."""
        return self._lcs
56 |
--------------------------------------------------------------------------------
/genalog/text/preprocess.py:
--------------------------------------------------------------------------------
1 | # ---------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation. All rights reserved.
3 | # Licensed under the MIT License.
4 | # ---------------------------------------------------------
5 |
6 | import re
7 |
8 | END_OF_TOKEN = {" ", "\t", "\n"}
9 | NON_ASCII_REPLACEMENT = "_"
10 |
11 |
def remove_non_ascii(token, replacement=NON_ASCII_REPLACEMENT):
    """Strip non-ASCII characters from a word token.

    Arguments:
        token (str) : a word token
        replacement (str, optional) : substitute used when stripping removes
            every character. Defaults to ``NON_ASCII_REPLACEMENT``.

    Returns:
        str -- the token with non-ASCII characters dropped, or ``replacement``
        if nothing remains of a non-empty token
    """
    stripped = token.encode("utf-8").decode("ascii", "ignore")
    # A non-empty token reduced to nothing is replaced with the default
    # character instead of being returned empty.
    if token and not stripped:
        return replacement
    return stripped
28 |
29 |
def tokenize(s):
    """Split a string into word tokens.

    Arguments:
        s (str) : the string to tokenize

    Returns:
        a list of tokens
    """
    # str.split() with no arguments splits on runs of spaces, tabs and
    # newlines, discarding the whitespace and any empty strings.
    return s.split()
41 |
42 |
def join_tokens(tokens):
    """Concatenate tokens into a single space-separated string.

    Arguments:
        tokens (list) : a list of tokens

    Returns:
        a string with the tokens joined by single spaces
    """
    separator = " "
    return separator.join(tokens)
53 |
54 |
def _is_spacing(c):
    """Return True if ``c`` is a token-separating character (member of END_OF_TOKEN).

    Arguments:
        c (str) : a single character

    Returns:
        bool -- whether the character is ignorable spacing
    """
    # The membership test already yields a bool; the original
    # `True if ... else False` conditional was redundant.
    return c in END_OF_TOKEN
58 |
59 |
def split_sentences(text, delimiter="\n"):
    """Insert ``delimiter`` after space-separated sentence punctuation.

    Arguments:
        text (str) : text whose sentence-ending punctuation appears as
            space-separated tokens
        delimiter (str, optional) : separator inserted after each sentence
            boundary. Defaults to a newline.

    Returns:
        the text with ``delimiter`` appended after each punctuation run
    """
    # One or more " ." / " !" / " ?" tokens (optionally "/"-prefixed)
    # followed by a space mark a sentence boundary.
    boundary = r"(( /?[.!?])+ )"
    return re.sub(boundary, r"\1" + delimiter, text)
63 |
64 |
def is_sentence_separator(token):
    """Return True if the token is a sentence splitter.

    A splitter is a lone ".", "!" or "?" optionally prefixed by "/".
    """
    return bool(re.match(r"^/?[.!?]$", token))
68 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | flake8
2 | flake8-import-order
3 | pytest
4 | pytest-cov
5 | pytest-mock
6 | pytest-xdist[psutil]
7 | pytest-lazy-fixture
8 | tox
9 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | biopython
2 | numpy
3 | python-dotenv
4 | requests
5 | azure-core
6 | azure-common
7 | azure-storage-blob
8 | tqdm
9 | Jinja2==2.11.1
10 | WeasyPrint
11 | matplotlib
12 | scikit-image
13 | pandas
14 | aiofiles
15 | aiohttp
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import os

import setuptools

# Read the package version from VERSION.txt located next to this setup.py.
with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'VERSION.txt')) as version_file:
    BUILD_VERSION = version_file.read().strip()

# Loading dependencies from requirements.txt
with open('requirements.txt') as f:
    requirements = f.read().splitlines()

# README.md doubles as the long description rendered on PyPI.
with open("README.md", "r", encoding="utf8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="genalog",
    install_requires=requirements,
    version=BUILD_VERSION,
    author="Jianjie Liu & Amit Gupte",
    author_email="ta_maidap_fy20_h2@microsoft.com",
    description="Tools for generating analog document (images) from raw text",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url='https://github.com/microsoft/genalog',
    # Ship the library only; test packages are excluded from the distribution.
    packages=setuptools.find_packages(exclude=['tests', 'tests.*']),
    # NOTE(review): package_data globs are resolved relative to each package
    # directory; a repo-rooted path like 'genalog/generation/templates/*.jinja'
    # under the '' key may not match anything — confirm the .jinja templates
    # actually land in the built wheel.
    package_data={'': [
        'genalog/generation/templates/*.jinja'
    ]},
    include_package_data=True,
    classifiers=[
        "Programming Language :: Python :: 3",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
)
36 |
--------------------------------------------------------------------------------
/tests/.env:
--------------------------------------------------------------------------------
1 | COMPUTER_VISION_ENDPOINT = "https://enki-vision.cognitiveservices.azure.com/"
2 | SEARCH_SERVICE_NAME = "ocr-ner-pipeline"
3 | SKILLSET_NAME = "testocrskillset"
4 | INDEX_NAME = "testocrindex"
5 | INDEXER_NAME = "testocrindexer"
6 | DATASOURCE_NAME = "syntheticimages"
7 | DATASOURCE_CONTAINER_NAME = "testocrimages"
8 | BLOB_NAME = "syntheticimages"
9 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/__init__.py
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 | import pytest
5 | from dotenv import load_dotenv
6 |
7 | from tests.required_env import RequiredEnvVar
8 |
9 | ENV_FILEPATH = "tests/.env"
10 |
11 |
@pytest.fixture(scope="session")
def load_azure_resources():
    """Session-scoped fixture: load the non-secret Azure settings from the
    tests/.env file and dump every required environment variable at debug
    level so misconfiguration is visible in the test log."""
    load_dotenv(ENV_FILEPATH)
    logging.info(f"Loading .env from {ENV_FILEPATH}")
    logging.debug("Printing environment vars: ")
    for required in RequiredEnvVar:
        value = os.environ.get(required.value)
        logging.debug(f"\t{required.value}: {value}")
20 |
--------------------------------------------------------------------------------
/tests/e2e/data/conll_formatter/clean_labels/2161.txt:
--------------------------------------------------------------------------------
1 | who O
2 | would O
3 | be O
4 | elevated O
5 | to O
6 | Heaven O
7 | and O
8 | not O
9 | be O
10 | burned O
11 | in O
12 | etermal O
13 | damnation O
14 | , O
15 | only O
16 | slants O
17 | the O
18 | facts O
19 | : O
20 | & O
21 | quot O
22 | ; O
23 | . O
24 |
--------------------------------------------------------------------------------
/tests/e2e/data/conll_formatter/ocr_text/17.txt:
--------------------------------------------------------------------------------
1 | So , al - jazeera TV station seized this opportunity to get hold of , this encies group of people by faring them , or , at their reporters , editors , or anchors at high salaries , Uhuh . So, they had a relatively good team of reporters . Un fun . Well , this way , later it followed that , or ! or. . , er , six - Chanese character prix loss of news reporting cased , er , independence , neutraity , an , neveraity , basece and freedom Un fun . Uh-huh. So , what it reported was in a completely different style from that of some other Arab TV stations. Right . in thas respect , that is , bet me add one powx , that is , this al- Jazeera TV station, ah , it's style is very much characterized by direct borrowing from the west , for in stance , the two mainstream media outlets un-ten . Unsoon. Right . Un toh One is CAN, and the other is BSC . Yeah , Well , I think that BBC , in particular . hes grate s bag influence on it . Just now that is to say , many of its reporters directly came from the societe fast charmed of that time . Un fam . That was jointty pin by Bec and Saudi Arsexia ! So I had s very good foundation! in adition , that is . actually . this aljazeera TV station has a quite unique structure. That's because the one of this country is caked Hamad : He studied in Britain and therefore had quite a good knowsedge of Bream's BBC TV station , He also ques adriwed . .So, in this way, it has borrowed some of the BBC style . For instance, Britain's BNC is a very old TV group established in 1927 . Un hon: Though it is funded by the government with many of is properties owned by the government , at the fodoes the guideline of ta ter's independence . Un-hun For iatance, the top deckion . making body of BBC is called the board of director which are composed of twelve members sweetly appointed by the Queen . Unnun, So at Jazeera TV station has onto adopted that structure . It has a top seven . member board of directors . Un huh . Un tech . However , even! 
it's property, an , and funds can come from the government . R gis relatively has its . Mortal independence. En, but in essence, be Wy, is it ready a private TV station or government . nun TV station ? Er , they themselves claim that & is a private TV station , but in revery I could not have been established weneed a large amount of financial support given by the govern mart. That's because it, eh, has an extremely small number of ads during as around the clock, an. TVbroadcast Un hun Un hun ! well, in audition , at has such a large team , especially with high , high er, wages , and reporters based abroad, so many reporters abroad. If the government had not sug ported wich exper Stures , on, It would be impossible for a private TV station to survive , According to your knowledge , how much is its yearty expenditure ? it was said that the yearly expenditures seem to be about 7 bastion US dollars. About 7 bibion US dollars, that is equivalent to more than 50 bastion Road We showed say this is a very huge financial expenditure, Un fun. Extremely large. wan. Therefore, some people call it gaining woke but kising money because it has relatively imas revenues due to few ads Ut hun Unfun Union , Un tan. But it has a tremendous influence. Right. So. It appears that * is sod not bad in gaining voice as it does achieve some effect, wan. He was, speaking of its andtu ence , we have ano nocked that or, during the Alphon was, er, because it has the exclusive interview right to enter Mohantion to conduct independent and cocksive interviews, we could say this is one of is advantages, Uhiuhi, & ano presented an opportunity for as sex. wet, to the development of a TV station , both opportunity and real strength are actually very important ) well , as al Jazeera TV sta tion has been was to develop see as current status. what do you think has & robed upon so that its competitiveness and invential power , even surpassed CAN and BAC during the iraq war ? 
The great west feature of al jazeera TV station is that it is a small station that competes with large ones and has become wed . known through wars . That is, the reason why it can establish itself is that it first resed on the Action war, and then the way war after the 9/ 11, Unfun Chifue So, It made fod use of these two opportunities . in addition, as the iraq war occurred right in an Arab country , everyone is very much concerned with what is going on in the war . So , thus provides it with a large viewership. making it instantly famous . Un hurt , Un fish . Besides , i has a lot of resources , including it's exckative coverage right in Afghanistan as you mentioned wist now . Un ton , un tan . in addition , al Jazeera TV station is actualy quite ious in the Arab rection, For instance , a very tough commut ie this region is the conflict between Palestine and barzel unfun Utton. Well, in gerard, Arabs wie nick wake the trash leader in make a speech on their TV. However, for al jazeera TV station , It could inyee Barak the israel prime minister at that time , to debver a speech at al Jazeera TV station , This was un precedented in the Arab world . Right . Un huh . Well , this also gave it a very unique perspective . Un
--------------------------------------------------------------------------------
/tests/e2e/data/conll_formatter/ocr_text/1839.txt:
--------------------------------------------------------------------------------
1 | would n't k be more & quot ; deflational & quot ; to excise bebefs where we can ? Box manyways were and still are . convinced that the memory lobotomy was intentional . part of washington 's plans to ex cige the strong . rooted nation that was andreplace it with their own model , And you can also exche the topnerseparatce wow needed to return the pendant to which explains it be that & quot ; Rumor is that a personal kem of a priest 's may be used to drew & out or excise a from is hours & quot ; and the wants to use the pendant to & quot : remove a particularly burdensome spire from a property I'm loking to kwest in & quot : Would n't it be more deflations to excise beeats where we can ? You're probably right , you ca n't excise an entire category LAB unless you 've budgeted for entertainment and vacations , in which case that should be the first to go to zero ARE , but I suspect you need to book twit at the areas of large expense wich for most people are housing and cars. My famay practice doctor was going to excise it and although I have a very high level of confidence in him . I posted ask. big # I should be referred to a surgeon . But we loise something in the world when are have to excise at imagery of chadren , they innocence and joy from our world, In order to protect them? The doctor ex cises the biopsy and does n't stitch up the wound site to preserve healthy tiswe for a week or more be fore he gives me the results of has exam. That would be more evident had you not excised the crap to which I was responding , but of course you had to leave that out in order to come up on your high horse and found superior. wore : You're probably right, you ca n't excise an entire category LAD unless you've budgeted for entertainment and vacations , In which case that should be the first to go to zero. And. , beat I suspect you need to look first at the areas of large expense which for most people are housing and cars ? " ARB. 
war quote , which your neatly excised from this post, could be interpreted several offerent ways : and humorous was envy one of them . Diebold and the dubious voting machines voting machine company Diebold apparently excised long paragraphs detawny the US security indus. try's concerns over the integrity of thes voting machines, and information about the company 's chief executive 's neweasing for President bush, It extinguishes the small; it inflames the great. This made me into s meany , but at aise extinguished the whining. He said that we had to extinguish the bigies of the world , and when we would see the lights of New wat go out, we would know the our job was done & quot ; too have done nothing that extinguishes others " homes to use the land . just lee the he wrists , they are trying to extinguish the flames of the jewish sox & quot ; Father , extinguishes the thast of our poverty . Then I took tom from & and extinguished it with my hands , which made the evening news . The reason for this is that the owner of the house is treated as paying off part of the Gett wah the /MY of the house . however, because the debt is a personal lately. the transfer of the house don't not extinguish the remainder of the date, which continues is be a personal Sabaity of the owner ! You're paying attention to the new behavior and letting the bid one go, so the old one extin guithes . If you did n't extinguish the fame, you 'd have good kick for a fod year ! The present Tet madaw Government extinguished the conducts of fire the could not be extinguished at the time of pre visit Governeverts If he 'd 've done ., he I've found 's way to extinguish es before . prestened to turn them as crispy . The Vista fire Department extinguished the bare before i grew out of control the statement said. Thus little by better, It extinguishes their sports and enervates their souls .. 
R Is understood that the rain drop reserves of the tree must wolter in capecky to the plan of government * has to extinguish, They tell your that if you accidentally start a fire and you have n't managed to extin guish it in the first name , you're probably not going to be able to and you should get help ined ately . This mual harkens back to the days when Somann was one of only two days the other being Betaire when a was considered correct to extinguish the & quot ; hearth fire & quot ; and then to re light it, The two other meds are sbout to teach thes investigations, and according to our sources , the White House exerts pressures so that they extinguish the business . You have to be careful what you extinguish , It only takes a few minutes to appopriately extinguish any behavior , but you 've got to know HOWE : The suspect's son, a fee forover, was ised enter this year while trying to extinguish s factory bare , according to police. I know k is to extinguish a five but how old could they be and what we they worth ? ? ? 17 )? when the new bankruptcy be was passed by Congress last spring, bankers predicted & would turn many people away from the protection of the courts by making & harder to ex ianguish dete , A flashlight shone in one comer of a dark room does not extinguish the rest of the room we just ca n't SEE R .
--------------------------------------------------------------------------------
/tests/e2e/data/conll_formatter/ocr_text/2161.txt:
--------------------------------------------------------------------------------
1 | who would be elevated to Heaven and not be burned in etermal damnation , only slants the facts : & quot ; .
--------------------------------------------------------------------------------
/tests/e2e/data/conll_formatter/ocr_text/5.txt:
--------------------------------------------------------------------------------
1 | well, this is because he is all using the pre Cold . was automatic thinking after the Cold War, that is the United States is the number one superpower in the world . Un ten : Un hat, Japan is the second largest economy in the world . Un hun As long as these two countries stand together , then it seems that other countries could not do anything to them . Un hun . well, this is actually a very overbearing image Un hit . Well, looking at the situation in Ava, actually , I'is exactly the other way round . Un. hun. That is to say , I it does not get on good terms with is Asian neighboring countries. then in rest. ly the US mis become even more hard . mine in its diplomacy with japan. As for japan, to the US. . As thermore has lost some assets , diplomneck assets when making negotiations with the US. Un fun. Un hun. Well, just recently, it's nee diets for us to say, or, that Japan's relations , we can say , wich it's neighboring countries that your have suffered overat deterioration . These inchoate Japan 's relations with South Korea, Chine, and Russia which are all deteriorating, was, even including it's relations with the US. in fact,. even today , we sho noticed a piece of news that athough at, uns, the recers two ! pills . two meeting, or , the US and japan again reached an intermediate report, Un tun, Un fun . UP hun. Well . today , actualy the head of japan's Defense Agency again mentioned to the US that seems to feel regretted, why ? Uh-huh, That is , a be regreens, Because, according to the two - pais . two intermediate report jist reached. the US matary bases in japan should undergo adjustment . Now ever . I was met with strong opposition from the general pubic in Japan . So it may be hard to inple ment . Union So, now the DA head came to talk with the US, saying, whether our interests can be taken wito comideration again . or. in this inverme face report . That is, between the US and Japan. 
centering upon this intermediate report, actually the deal is again partially completed . Uh ton. So . such a prime minister is forum , who sandy follows the US as the passions of his diplomacy . was rarely seen even during the Coal wes ersUh-huh, We know that in 1957 Japan had three diplomatic principles . That is to say , It ment take good care of as relations with the western countries, Asian countries , and the United Nations : Uni out, Utsaun, But now! Keizer is lee a stick , let 's say . a post. Right it takes three points to support s plane . Wan . So it turned out that they had such son on the surface : Well ; some prime ministers before Kotzuns of beast would stil property deal with and butsince the relations with America , with the US and Asia , In particular , with them . an Un hun . Yes when a comes to Kozuers , he only takes good care of the relations with the US, Right . in fact, he wisely fail to take good care of the relations wah the US because if you can not win the trust of your Asin neighboring countries, the US we not by respect such a country . Un tan ! Un has . The US also wants to go beyond japan to keep good relations wan Chains , South korea , and other Asian coun. tries . Therefore , the more he follows the US, the lover will be ise actual status in Ass, Unfan. Ris apossible . Right , his kies was actually erakired and opposed by some postxian's even in japan. He was strongly criticized and increasingly isolated . Ah . The domestic economy was also affected . Many people afto mentioned , by young the US alone, you are inoring Atla . was. Now many meds of is sued statements and accorians. Therefore, some people also compared this diplomacy of has to a lame diplomacy . Yeah . He - Could he wake stewkey with a lame diplomacy ? Eh, as far as kolrumi himself is concerned , what is his probeem ? Uh ten , He himself, oh, or what showed we say ? He does not re spect and acknowledge reality . 
Actually , some people say he is deceiving himself as well as others Un hat. for instance , that time at the APEC meeting in Puton. when he mentioned, or , japan 's reis When's with Chains, he win used this kind of, that is to say, thecork, to offend manned, Un hen Union He said, or an , right now, Japan. China is ations are not lee, or, what the internacional comes nety is worried about Un ton. well the ecoreank trade development in our two countries is going on gate well, in fact , we can say he does not understand the current status of Sino . Japanese relations Un hut. Actually, due to his visit to resusun Shine and the cooling down of the political relationship Sing . japanese relations have suffered severe blows , He is only indulging in his unilateral winds thinking . I is only what he thinks. thi han , we can see some concrete figures , for instance, from larwary to August its's year, the latest statistics show that japan's experts to Chine grew only by 3.2 % according to China's statistics . Union Uni butt . It grew only by 5.8 % according to japan's statistics If tum . Well , in prevex's years . It was always more than 25 % . in other words , Japan's exports to China were declining sharply . More than 25 % . Un hun .
--------------------------------------------------------------------------------
/tests/e2e/data/conll_formatter/ocr_text/7965.txt:
--------------------------------------------------------------------------------
1 | A it the ultimate room with a view for formula one fans , racing afficionados all next week be able to spend the night and the race day in the heart of the action at the malaysian grand prix on the sepang frack . between march 27 and 29 , visitors wit site have a unique midnight tour of the track , a trade tional matrian breakfast in the morning with former git reporter sanjeev palar , and an exclusive be And the scenes pit tour. enjoy the world 's hottest race , /1 , from the comforts of your safe , with a unique stay in the heart of the action inside sepang track in malaysia . the apartment boasts unrivaled views of the racing track and will allow fans to witness their favourite start up close , the apartment sleeps four and provides a unique luxury experience with panoramic wows . guests can enjoy the open plan apartment for all those sessions of the race including practice . qualifying and race day Itself . the home Is described as an " basis at the heart of the world 's hottest race ' and comes complete with a sige living room featuring panorama windows offering uninteripted views of the racetrack , one mas let bedroom ,'s kitchen , a divine area and a bathroom : fans can witness every second of the world 's fastest engines from the comfort of the living room couch . they can also bring along those friends of their own to share in the exciting event ! from monaco to melbourne , the world 's fastest and most prestigious race has traversed five continents, renowned for it's mooring engines and sky high tempers ture . depending on the dates assigned . quests will be treated to a race pa tour , a midnight track tour and an sunday breakfast with an auto host , malaysia 's pit lane reporter , sanjeev palar . 
in or det to secure your place in this once in a sictime racing experience , enter at airbnb before march 22 the sepang track stay forms part of awoes 's 's night at ' campaign, which aims to convert unique to cations around the world , where no one has ever been able to send the night before , into unforget table wight says . this has included a night at the top of use holmenkollen ski jump in norway . inside the open plan apartment , there is a large living room . with panorama views of the racetrack , one master bedroom , a small kitchen, # dining area and a bathroom . quests will get to watch all three sessions of the race - practice , qualifying and race day fock
--------------------------------------------------------------------------------
/tests/e2e/data/splitter/example_splits/clean_labels/1.txt:
--------------------------------------------------------------------------------
1 | On O
2 | July B-DATE
3 | 22 I-DATE
4 | , I-DATE
5 | 1940 I-DATE
6 | , O
7 | a O
8 | campaign O
9 | preparation O
10 | order O
11 | to O
12 | attack O
13 | the B-FAC
14 | Zhengtai I-FAC
15 | Railway I-FAC
16 | , O
17 | jointly O
18 | signed O
19 | by O
20 | Zhu B-PERSONNAME
21 | De I-PERSONNAME
22 | , O
23 | Peng B-PERSONNAME
24 | Dehuai I-PERSONNAME
25 | , O
26 | and O
27 | Zuo B-PERSONNAME
28 | Quan I-PERSONNAME
29 | , O
30 | was O
31 | sent O
32 | to O
33 | Yan'an B-GPE
34 | and O
35 | all O
36 | units O
37 | of O
38 | the B-ORGANIZATION
39 | Eighth I-ORGANIZATION
40 | Route I-ORGANIZATION
41 | Army I-ORGANIZATION
42 | . O
43 |
44 | What O
45 | was O
46 | the O
47 | , O
48 | purpose O
49 | and O
50 | goal O
51 | of O
52 | this O
53 | campaign O
54 | ? O
55 | ? O
56 | ? O
57 | ? O
58 |
59 | It O
60 | was O
61 | to O
62 | break O
63 | through O
64 | the O
65 | Japanese B-NORP
66 | army O
67 | 's O
68 | siege O
69 | policy O
70 | against O
71 | base O
72 | areas O
73 | behind O
74 | enemy O
75 | lines O
76 | , O
77 | and O
78 | to O
79 | avert O
80 | the O
81 | crisis O
82 | of O
83 | China B-GPE
84 | 's O
85 | compromise O
86 | and O
87 | surrender O
88 | . O
89 |
90 | It O
91 | was O
92 | to O
93 | overcome O
94 | this O
95 | crisis O
96 | . O
97 |
98 | Well O
99 | , O
100 | the B-EVENT
101 | Hundred I-EVENT
102 | Regiments I-EVENT
103 | Offensive I-EVENT
104 | was O
105 | divided O
106 | into O
107 | three B-CARDINAL
108 | phases O
109 | . O
110 |
111 | Beginning O
112 | from O
113 | August B-DATE
114 | 20 I-DATE
115 | , O
116 | from O
117 | August B-DATE
118 | 20 I-DATE
119 | to I-DATE
120 | September I-DATE
121 | 10 I-DATE
122 | , O
123 | the O
124 | main O
125 | purpose O
126 | of O
127 | the O
128 | ... O
129 | . O
130 |
131 |
--------------------------------------------------------------------------------
/tests/e2e/data/splitter/example_splits/clean_text/0.txt:
--------------------------------------------------------------------------------
1 | What kind of memory ?
2 | We respectfully invite you to watch a special edition of Across China ! ! !
3 | WW II Landmarks on the Great Earth of China : Eternal Memories of Taihang Mountain Standing tall on Taihang Mountain is the Monument to the Hundred Regiments Offensive .
4 | It is composed of a primary stele , secondary steles , a huge round sculpture and beacon tower , and the Great Wall , among other things .
5 | A primary stele , three secondary steles , and two inscribed steles .
6 | The Hundred Regiments Offensive was the campaign of the largest scale launched by the Eighth Route Army during the War of Resistance against Japan .
7 | This campaign broke through the Japanese army 's blockade to reach base areas behind enemy lines , stirring up anti-Japanese spirit throughout the nation and influencing the situation of the anti-fascist war of the people worldwide .
8 | This is Zhuanbi Village , Wuxiang County of Shanxi Province , where the Eighth Route Army was headquartered back then .
9 | On a wall outside the headquarters we found a map .
10 | This map was the Eighth Route Army 's depiction of the Mediterranean Sea situation at that time .
11 | This map reflected the European battlefield situation .
12 | In 1940 , the German army invaded and occupied Czechoslovakia , Poland , the Netherlands , Belgium , and France .
13 | It was during this year that the Japanese army developed a strategy to rapidly force the Chinese people into submission by the end of 1940 .
14 | In May , the Japanese army launched -- From one side , it seized an important city in China called Yichang .
15 | Um , , uh , through Yichang , it could directly reach Chongqing .
16 | Ah , that threatened Chongqing .
17 | Then they would , ah , bomb these large rear areas such as Chongqing .
18 | So , along with the coordinated , er , economic blockade , military offensives , and strategic bombings , er , a simultaneous attack was launched in Hong Kong to lure the KMT government into surrender .
19 | The progress of this coordinated offensive was already very entrenched by then .
20 | By 1940 , China 's War of Resistance against Japan had entered a stalemate .
21 | The situation on our side and the enemy 's side was intertwined .
22 | The Eighth Route Army guerrillas were extraordinarily active , creating more and more trouble for the Japanese army in North China .
23 | Hayao Tada , commander of the Japanese North China Area Army , adopted a strategy of siege warfare to deal with the Eighth Route Army .
24 | The specific method was building a closely connected transport network , with a road for every village and defensive towers on every road .
25 | Roads and railways were used as links to connect all of North China into a solid , widespread siege , in order to strangle the Eighth Route Army and its base areas in this net .
26 | As part of the Japanese army 's strategy of siege warfare , railways and roads had actually become the Japanese army 's weapons of war , becoming a great threat to the base areas .
27 | In December 1939 , Commander - in - chief Zhu De and Vice Commander Peng Dehuai of the Eighth Route Army received a top - secret telegram from Commander Lu Zhengcao of the Jizhong Military District , among other people .
28 | The telegram said that the Japanese troops were building blockade trenches and chessboard - like roads to divide the Jizhong base area into small isolated blocks without the ability to mutually communicate and support each other , causing the Eighth Route Army and the guerrillas to lose maneuverability .
29 | Before the Hundred Regiments Offensive in 1940 , an inclination to compromise , ah , surrender , was an extremely serious crisis in the frontline situation in China .
30 | Well , on the battlefield behind enemy lines , in order to take over , consolidate the area under its occupation , Japan began a new strategy .
31 | That was to use railways as a pillar , roads as a chain , and strongholds as a lock , to carry out siege warfare in an attempt to divide the base areas behind enemy lines , ah , so as , er , to cut off their communication with one another .
32 | In addition , it relied on this cage , ah , to further strengthen its assaults against the base areas .
33 | Er .
34 | So , it was amidst such a grave international and domestic situation that the Eighth Route Army led by the Chinese Communist Party , ah , launched , ah , a strategic offensive called the Hundred Regiments Offensive .
35 | This plot of the Japanese army drew great attention from Zhu De and Peng Dehuai of Eighth Route Army headquarters .
36 | After meticulous studies and painstaking preparations by many parties , a battle plan based on surprise was formulated .
37 |
--------------------------------------------------------------------------------
/tests/e2e/data/splitter/example_splits/clean_text/1.txt:
--------------------------------------------------------------------------------
1 | On July 22 , 1940 , a campaign preparation order to attack the Zhengtai Railway , jointly signed by Zhu De , Peng Dehuai , and Zuo Quan , was sent to Yan'an and all units of the Eighth Route Army .
2 | What was the , purpose and goal of this campaign ? ? ? ?
3 | It was to break through the Japanese army 's siege policy against base areas behind enemy lines , and to avert the crisis of China 's compromise and surrender .
4 | It was to overcome this crisis .
5 | Well , the Hundred Regiments Offensive was divided into three phases .
6 | Beginning from August 20 , from August 20 to September 10 , the main purpose of the ... .
7 |
--------------------------------------------------------------------------------
/tests/e2e/data/synthetic_dataset/shared/train/clean_labels/2161.txt:
--------------------------------------------------------------------------------
1 | who O
2 | would O
3 | be O
4 | elevated O
5 | to O
6 | Heaven O
7 | and O
8 | not O
9 | be O
10 | burned O
11 | in O
12 | etermal O
13 | damnation O
14 | , O
15 | only O
16 | slants O
17 | the O
18 | facts O
19 | : O
20 | & O
21 | quot O
22 | ; O
23 | . O
24 |
--------------------------------------------------------------------------------
/tests/e2e/data/synthetic_dataset/test_version/.gitignore:
--------------------------------------------------------------------------------
1 | **/ocr_labels
2 | **/ocr_text
--------------------------------------------------------------------------------
/tests/e2e/data/synthetic_dataset/test_version/train/ocr/2161.json:
--------------------------------------------------------------------------------
1 | [{"language": "en", "text": "who would be elevated to Heaven and not be burned in etermal damnation , only slants the facts : & quot ; .", "lines": [{"boundingBox": [{"x": 146, "y": 157}, {"x": 1252, "y": 156}, {"x": 1253, "y": 179}, {"x": 147, "y": 180}], "text": "who would be elevated to Heaven and not be burned in etermal damnation , only slants the facts : &"}, {"boundingBox": [{"x": 147, "y": 184}, {"x": 228, "y": 183}, {"x": 229, "y": 203}, {"x": 148, "y": 204}], "text": "quot ; ."}], "words": [{"boundingBox": [{"x": 147, "y": 158}, {"x": 192, "y": 158}, {"x": 192, "y": 179}, {"x": 147, "y": 179}], "text": "who"}, {"boundingBox": [{"x": 199, "y": 158}, {"x": 263, "y": 158}, {"x": 264, "y": 179}, {"x": 199, "y": 179}], "text": "would"}, {"boundingBox": [{"x": 271, "y": 158}, {"x": 299, "y": 157}, {"x": 299, "y": 180}, {"x": 271, "y": 179}], "text": "be"}, {"boundingBox": [{"x": 307, "y": 157}, {"x": 400, "y": 157}, {"x": 400, "y": 180}, {"x": 308, "y": 180}], "text": "elevated"}, {"boundingBox": [{"x": 407, "y": 157}, {"x": 429, "y": 157}, {"x": 430, "y": 180}, {"x": 407, "y": 180}], "text": "to"}, {"boundingBox": [{"x": 436, "y": 157}, {"x": 518, "y": 157}, {"x": 518, "y": 180}, {"x": 437, "y": 180}], "text": "Heaven"}, {"boundingBox": [{"x": 528, "y": 157}, {"x": 567, "y": 157}, {"x": 567, "y": 180}, {"x": 528, "y": 180}], "text": "and"}, {"boundingBox": [{"x": 574, "y": 157}, {"x": 613, "y": 157}, {"x": 614, "y": 180}, {"x": 574, "y": 180}], "text": "not"}, {"boundingBox": [{"x": 618, "y": 157}, {"x": 646, "y": 157}, {"x": 646, "y": 180}, {"x": 618, "y": 180}], "text": "be"}, {"boundingBox": [{"x": 653, "y": 157}, {"x": 730, "y": 157}, {"x": 730, "y": 180}, {"x": 653, "y": 180}], "text": "burned"}, {"boundingBox": [{"x": 736, "y": 157}, {"x": 757, "y": 157}, {"x": 757, "y": 180}, {"x": 736, "y": 180}], "text": "in"}, {"boundingBox": [{"x": 765, "y": 157}, {"x": 854, "y": 157}, {"x": 854, "y": 180}, {"x": 765, "y": 180}], "text": "etermal"}, 
{"boundingBox": [{"x": 858, "y": 157}, {"x": 970, "y": 157}, {"x": 970, "y": 180}, {"x": 858, "y": 180}], "text": "damnation"}, {"boundingBox": [{"x": 979, "y": 157}, {"x": 990, "y": 157}, {"x": 990, "y": 180}, {"x": 979, "y": 180}], "text": ","}, {"boundingBox": [{"x": 994, "y": 157}, {"x": 1041, "y": 157}, {"x": 1041, "y": 180}, {"x": 994, "y": 180}], "text": "only"}, {"boundingBox": [{"x": 1046, "y": 157}, {"x": 1114, "y": 157}, {"x": 1114, "y": 180}, {"x": 1046, "y": 180}], "text": "slants"}, {"boundingBox": [{"x": 1118, "y": 157}, {"x": 1154, "y": 157}, {"x": 1154, "y": 180}, {"x": 1118, "y": 180}], "text": "the"}, {"boundingBox": [{"x": 1161, "y": 157}, {"x": 1216, "y": 157}, {"x": 1216, "y": 179}, {"x": 1161, "y": 180}], "text": "facts"}, {"boundingBox": [{"x": 1220, "y": 157}, {"x": 1233, "y": 157}, {"x": 1233, "y": 179}, {"x": 1220, "y": 179}], "text": ":"}, {"boundingBox": [{"x": 1237, "y": 157}, {"x": 1253, "y": 157}, {"x": 1253, "y": 179}, {"x": 1237, "y": 179}], "text": "&"}, {"boundingBox": [{"x": 148, "y": 185}, {"x": 198, "y": 184}, {"x": 198, "y": 204}, {"x": 149, "y": 205}], "text": "quot"}, {"boundingBox": [{"x": 202, "y": 184}, {"x": 213, "y": 184}, {"x": 213, "y": 204}, {"x": 202, "y": 204}], "text": ";"}, {"boundingBox": [{"x": 217, "y": 184}, {"x": 229, "y": 184}, {"x": 228, "y": 204}, {"x": 216, "y": 204}], "text": "."}]}]
--------------------------------------------------------------------------------
/tests/e2e/templates/solid_bg.html.jinja:
--------------------------------------------------------------------------------
1 |
8 | {% if content and content.content_type.__str__() == "ContentType.COMPOSITE" %}
9 | {% for c in content %}
10 | {% if c.content_type.__str__() == "ContentType.PARAGRAPH"%}
11 |
12 | {{ c }}
13 |
14 | {% else %}
15 | Unsupported Content Type: {{c.content_type.__str__()}}
16 | {% endif %}
17 | {% endfor %}
18 | {% else %}
19 | No content loaded
20 | {% endif %}
21 |
--------------------------------------------------------------------------------
/tests/e2e/test_anchor_e2e.py:
--------------------------------------------------------------------------------
1 | import difflib
2 | import glob
3 | import warnings
4 |
5 | import pytest
6 |
7 | from genalog.text import alignment, anchor, preprocess
8 |
9 |
@pytest.mark.slow
@pytest.mark.parametrize(
    "gt_file, ocr_file",
    zip(
        sorted(glob.glob("tests/unit/text/data/gt_*.txt")),
        sorted(glob.glob("tests/unit/text/data/ocr_*.txt")),
    ),
)
def test_align_w_anchor_and_align(gt_file, ocr_file):
    """Cross-check anchor-based alignment against full alignment on real data.

    The two algorithms may legitimately produce slightly different output;
    a mismatch is surfaced as a warning (with a unified diff) rather than
    as a test failure.
    """
    # Use context managers so the file handles are closed deterministically
    # (the original left `open(...).read()` handles to the garbage collector).
    with open(gt_file, "r") as f:
        gt_text = f.read()
    with open(ocr_file, "r") as f:
        ocr_text = f.read()
    aligned_anchor_gt, aligned_anchor_noise = anchor.align_w_anchor(gt_text, ocr_text)
    aligned_gt, aligned_noise = alignment.align(gt_text, ocr_text)

    if aligned_gt != aligned_anchor_gt:
        # Diff sentence-by-sentence to keep the warning readable.
        anchor_sentences = aligned_anchor_gt.split(".")
        full_sentences = aligned_gt.split(".")
        str_diff = "\n".join(difflib.unified_diff(full_sentences, anchor_sentences))
        warnings.warn(
            UserWarning(
                "\n"
                + f"{str_diff}"
                + "\n\n**** Inconsistent Alignment Results between align() and "
                + "align_w_anchor(). Ignore this if the delta is not significant. ****\n"
            )
        )
36 |
37 |
@pytest.mark.slow
@pytest.mark.parametrize(
    "gt_file, ocr_file",
    zip(
        sorted(glob.glob("tests/unit/text/data/gt_*.txt")),
        sorted(glob.glob("tests/unit/text/data/ocr_*.txt")),
    ),
)
@pytest.mark.parametrize("max_seg_length", [25, 50, 75, 100, 150])
def test_find_anchor_recur_e2e(gt_file, ocr_file, max_seg_length):
    """Recursively-found anchors must index identical words in both texts."""
    # Close the file handles deterministically (the original leaked them).
    with open(gt_file, "r") as f:
        gt_text = f.read()
    with open(ocr_file, "r") as f:
        ocr_text = f.read()
    gt_tokens = preprocess.tokenize(gt_text)
    ocr_tokens = preprocess.tokenize(ocr_text)
    gt_anchors, ocr_anchors = anchor.find_anchor_recur(
        gt_tokens, ocr_tokens, max_seg_length=max_seg_length
    )
    for gt_anchor, ocr_anchor in zip(gt_anchors, ocr_anchors):
        # Ensure that each anchor word is the same word in both text
        assert gt_tokens[gt_anchor] == ocr_tokens[ocr_anchor]
58 |
--------------------------------------------------------------------------------
/tests/e2e/test_conll_format_e2e.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import itertools
3 |
4 | import pytest
5 |
6 | from genalog.text import conll_format
7 |
8 |
@pytest.mark.slow
@pytest.mark.parametrize(
    "required_args", [(["tests/e2e/data/synthetic_dataset", "test_version"])]
)
@pytest.mark.parametrize(
    "optional_args",
    [
        (["--train_subset"]),
        (["--test_subset"]),
        (["--gt_folder", "shared"]),
    ],
)
def test_conll_format(required_args, optional_args):
    """Smoke-test the conll_format CLI end-to-end with each optional flag."""
    cli_parser = conll_format.create_parser()
    full_arg_list = [*required_args, *optional_args]
    parsed_args = cli_parser.parse_args(args=full_arg_list)
    conll_format.main(parsed_args)
26 |
27 |
28 | basepath = "tests/e2e/data/conll_formatter/"
29 |
30 |
@pytest.mark.slow
@pytest.mark.parametrize(
    "clean_label_filename, ocr_text_filename",
    zip(
        sorted(glob.glob("tests/e2e/data/conll_formatter/clean_labels/*.txt")),
        sorted(glob.glob("tests/e2e/data/conll_formatter/ocr_text/*.txt")),
    ),
)
def test_propagate_labels_sentence_single_file(clean_label_filename, ocr_text_filename):
    """Propagate IOB labels from one clean-label file onto its OCR tokens."""
    with open(clean_label_filename, "r", encoding="utf-8") as clf:
        tokens_labels_str = clf.readlines()
    # Each well-formed line is "<token> <label>". Split each line once
    # (the original re-split every line four times).
    token_label_pairs = [line.split() for line in tokens_labels_str]
    clean_tokens = [pair[0].strip() for pair in token_label_pairs if len(pair) == 2]
    clean_labels = [pair[1].strip() for pair in token_label_pairs if len(pair) == 2]
    clean_sentences = conll_format.get_sentences_from_iob_format(tokens_labels_str)
    # read ocr tokens (the data files are already tokenized)
    with open(ocr_text_filename, "r", encoding="utf-8") as otf:
        ocr_text_str = " ".join(otf.readlines())
    ocr_tokens = [token.strip() for token in ocr_text_str.split()]

    ocr_text_sentences, ocr_labels_sentences = conll_format.propagate_labels_sentences(
        clean_tokens, clean_labels, clean_sentences, ocr_tokens
    )
    ocr_sentences_flatten = list(itertools.chain(*ocr_text_sentences))
    assert len(ocr_text_sentences) == len(clean_sentences)
    assert len(ocr_text_sentences) == len(ocr_labels_sentences)
    # ensure aligned ocr tokens == ocr tokens
    assert len(ocr_sentences_flatten) == len(ocr_tokens)
65 |
--------------------------------------------------------------------------------
/tests/e2e/test_generaton_n_degradation.py:
--------------------------------------------------------------------------------
1 | from genalog.degradation.degrader import Degrader
2 | from genalog.generation.content import CompositeContent, ContentType
3 | from genalog.generation.document import DocumentGenerator
4 |
5 |
TEST_OUTPUT_DIR = "test_out/"
# Sample CoNLL-style sentence used as the document body.
SAMPLE_TXT = """Everton 's Duncan Ferguson , who scored twice against Manchester United on Wednesday ,
was picked on Thursday for the Scottish squad after a 20-month exile ."""
DEFAULT_TEMPLATE = "text_block.html.jinja"
# (name, kwargs) pairs applied in order by Degrader.
DEGRADATION_EFFECTS = [
    ("blur", {"radius": 5}),
    ("bleed_through", {"alpha": 0.8}),
    (
        "morphology",
        {"operation": "open", "kernel_shape": (3, 3), "kernel_type": "plus"},
    ),
    ("morphology", {"operation": "close"}),
    ("morphology", {"operation": "dilate"}),
    ("morphology", {"operation": "erode"}),
]
21 |
22 |
def test_generation_and_degradation():
    """End-to-end: render a document from text, then degrade the image."""
    # Initiate content
    content = CompositeContent([SAMPLE_TXT], [ContentType.PARAGRAPH])
    doc_gen = DocumentGenerator()
    assert DEFAULT_TEMPLATE in doc_gen.template_list
    # Initiate template generator
    generator = doc_gen.create_generator(content, [DEFAULT_TEMPLATE])
    # Initiate degrader
    degrader = Degrader(DEGRADATION_EFFECTS)

    for doc in generator:
        # render the document as a single-channel grayscale image array
        src = doc.render_array(resolution=100, channel="GRAYSCALE")
        # run each degradation effect
        degrader.apply_effects(src)
38 |
--------------------------------------------------------------------------------
/tests/e2e/test_image_channel.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import pytest
3 |
4 | from genalog.generation.content import CompositeContent, ContentType
5 | from genalog.generation.document import DocumentGenerator
6 |
TEMPLATE_PATH = "tests/e2e/templates"
TEST_OUT_FOLDER = "test_out/"
SAMPLE_TXT = "foo"
# Single-paragraph document shared by all of the channel tests below.
CONTENT = CompositeContent([SAMPLE_TXT], [ContentType.PARAGRAPH])
11 |
12 |
@pytest.fixture
def doc_generator():
    """DocumentGenerator loading templates from the e2e template folder."""
    return DocumentGenerator(template_path=TEMPLATE_PATH)
16 |
17 |
@pytest.mark.io
def test_red_channel(doc_generator):
    """Render a solid red page and verify BGRA channel ordering."""
    # css "red" is rgb(255,0,0) or bgra(0,0,255,255)
    expected_bgra = (0, 0, 255, 255)
    for document in doc_generator.create_generator(CONTENT, ["solid_bg.html.jinja"]):
        document.update_style(background_color="red")
        rendered = document.render_array(resolution=100, channel="BGRA")
        assert tuple(rendered[0][0]) == expected_bgra
        cv2.imwrite(TEST_OUT_FOLDER + "red.png", rendered)
27 |
28 |
@pytest.mark.io
def test_green_channel(doc_generator):
    """Render a solid green page and verify BGRA channel ordering."""
    # css "green" is rgb(0,128,0) or bgra(0,128,0,255)
    expected_bgra = (0, 128, 0, 255)
    for document in doc_generator.create_generator(CONTENT, ["solid_bg.html.jinja"]):
        document.update_style(background_color="green")
        rendered = document.render_array(resolution=100, channel="BGRA")
        assert tuple(rendered[0][0]) == expected_bgra
        cv2.imwrite(TEST_OUT_FOLDER + "green.png", rendered)
38 |
39 |
@pytest.mark.io
def test_blue_channel(doc_generator):
    """Render a solid blue page and verify BGRA channel ordering."""
    # css "blue" is rgb(0,0,255) or bgra(255,0,0,255)
    expected_bgra = (255, 0, 0, 255)
    for document in doc_generator.create_generator(CONTENT, ["solid_bg.html.jinja"]):
        document.update_style(background_color="blue")
        rendered = document.render_array(resolution=100, channel="BGRA")
        assert tuple(rendered[0][0]) == expected_bgra
        cv2.imwrite(TEST_OUT_FOLDER + "blue.png", rendered)
49 |
--------------------------------------------------------------------------------
/tests/e2e/test_ocr_e2e.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import pytest
4 |
5 | from genalog.ocr.blob_client import GrokBlobClient
6 | from genalog.ocr.grok import Grok
7 |
8 |
@pytest.fixture(scope="module", autouse=True)
def load_azure_config(load_azure_resources):
    """Force the shared `load_azure_resources` fixture for every test here.

    The fixture loads the non-secret Azure settings; the secrets are assumed
    to already be set as environment variables before the session starts.
    """
    # Loading the non-secrets
    # Assume the secrets are set in the environment variable prior
    pass
14 |
15 |
@pytest.mark.azure
class TestBlobClient:
    """E2E tests for uploading and deleting image blobs via GrokBlobClient."""

    @staticmethod
    def _assert_uploaded_and_cleanup(blob_client, dst_folder):
        """Verify the three known test images landed in `dst_folder`, then delete it."""
        uploaded_items, _ = blob_client.list_blobs(dst_folder)
        uploaded_items = sorted(list(uploaded_items), key=lambda x: x.name)
        assert uploaded_items[0].name == f"{dst_folder}/0.png"
        assert uploaded_items[1].name == f"{dst_folder}/1.png"
        assert uploaded_items[2].name == f"{dst_folder}/11.png"
        blob_client.delete_blobs_folder(dst_folder)
        assert (
            len(list(blob_client.list_blobs(dst_folder)[0])) == 0
        ), f"folder {dst_folder} was not deleted"

    @pytest.mark.parametrize("use_async", [True, False])
    def test_upload_images(self, use_async):
        """Upload to a derived folder and to an explicit folder name."""
        blob_client = GrokBlobClient.create_from_env_var()
        subfolder = "tests/unit/ocr/data/img"
        # NOTE(review): the original called `subfolder.replace("/", "_")` and
        # discarded the result (str.replace is not in-place) — a no-op, removed.
        dst_folder, _ = blob_client.upload_images_to_blob(
            subfolder, use_async=use_async
        )
        self._assert_uploaded_and_cleanup(blob_client, dst_folder)

        dst_folder, _ = blob_client.upload_images_to_blob(
            subfolder, "test_images", use_async=use_async
        )
        assert dst_folder == "test_images"
        self._assert_uploaded_and_cleanup(blob_client, dst_folder)
49 |
50 |
@pytest.mark.skip(reason=(
    "Flaky test. Going to deprecate the ocr module in favor of the official python SDK:\n"
    "https://docs.microsoft.com/en-us/azure/cognitive-services/computer-vision/quickstarts-sdk/client-library?tabs=visual-studio&pivots=programming-language-python"  # noqa:E501
))
@pytest.mark.azure
class TestGROKe2e:
    """End-to-end OCR run through GROK (skipped; see reason above)."""

    @pytest.mark.parametrize("use_async", [False])
    def test_grok_e2e(self, tmpdir, use_async):
        grok = Grok.create_from_env_var()
        src_folder = "tests/unit/ocr/data/img"
        grok.run_grok(
            src_folder,
            tmpdir,
            blob_dest_folder="testimages",
            use_async=use_async,
            cleanup=True,
        )
        # Each input image must yield a JSON result with non-empty OCR text.
        # Use context managers instead of `json.load(open(...))` so the
        # result files are closed deterministically.
        for img_id in ("0", "1", "11"):
            with open(f"{tmpdir}/{img_id}.json", "r") as result_file:
                assert json.load(result_file)[0]["text"]
71 |
--------------------------------------------------------------------------------
/tests/e2e/test_pipeline.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 |
4 | import numpy as np
5 | import pytest
6 |
7 | from genalog.generation.document import DocumentGenerator
8 | from genalog.pipeline import AnalogDocumentGeneration, generate_dataset_multiprocess
9 |
EXAMPLE_TEXT_FILE = "tests/unit/text/data/gt_1.txt"
INPUT_TEXT_FILENAMES = glob.glob("tests/unit/text/data/gt_*.txt")

STYLES = {"font_size": ["5px"]}
STYLES_COMBINATION = {"font_size": ["5px", "6px"]}  # Multiple values per style are not supported right now
# NOTE(review): "DEGRATIONS" is a typo of "DEGRADATIONS"; kept as-is because
# renaming would touch every test in this file.
DEGRATIONS = [
    ("blur", {"radius": 3}),
    ("morphology", {"operation": "close"})
]
19 |
20 |
@pytest.fixture
def default_doc_generator():
    """AnalogDocumentGeneration with library-default styles/degradations."""
    return AnalogDocumentGeneration()


@pytest.fixture
def custom_doc_generator():
    """AnalogDocumentGeneration with custom styles, degradations and 300 dpi."""
    return AnalogDocumentGeneration(styles=STYLES, degradations=DEGRATIONS, resolution=300)


@pytest.fixture
def empty_style_doc_generator():
    """Generator with an empty style map; generate_img yields None (see test below)."""
    return AnalogDocumentGeneration(styles={})
34 |
35 |
@pytest.mark.parametrize("doc_generator", [
    pytest.lazy_fixture('default_doc_generator'),
    pytest.lazy_fixture('custom_doc_generator')
])
def test_generate_img_array(doc_generator):
    """generate_img with target_folder=None returns the image as an ndarray."""
    # Precondition checks
    assert len(doc_generator.list_templates()) > 0

    example_template = doc_generator.list_templates()[0]
    sample_img = doc_generator.generate_img(
        EXAMPLE_TEXT_FILE, example_template, target_folder=None
    )
    assert sample_img is not None
    assert isinstance(sample_img, np.ndarray)
50 |
51 |
def test_generate_img_array_empty(empty_style_doc_generator):
    """With an empty style map, no image is produced and None is returned."""
    # Precondition checks
    assert len(empty_style_doc_generator.list_templates()) > 0

    example_template = empty_style_doc_generator.list_templates()[0]
    sample_img = empty_style_doc_generator.generate_img(
        EXAMPLE_TEXT_FILE, example_template, target_folder=None
    )
    assert sample_img is None
61 |
62 |
@pytest.mark.io
@pytest.mark.parametrize("doc_generator", [
    pytest.lazy_fixture('default_doc_generator'),
    pytest.lazy_fixture('custom_doc_generator')
])
def test_generate_img_write_to_disk(tmpdir, doc_generator):
    """generate_img with a target folder must write PNGs under <folder>/img."""
    # TODO: generate_img() stores images under an "img" subfolder
    os.makedirs(os.path.join(tmpdir, "img"))
    png_pattern = os.path.join(tmpdir, "img", "*.png")
    # Precondition: nothing rendered yet, and at least one template exists.
    assert not glob.glob(png_pattern)
    templates = doc_generator.list_templates()
    assert templates

    doc_generator.generate_img(
        EXAMPLE_TEXT_FILE, templates[0], target_folder=tmpdir
    )
    # look for any png written to disk
    assert glob.glob(png_pattern)
82 |
83 |
@pytest.mark.io
@pytest.mark.parametrize("styles", [
    STYLES,
    pytest.param(
        STYLES_COMBINATION, marks=pytest.mark.xfail(
            reason="Style combinations are not supported. Only one value per style", strict=True)
    )
])
@pytest.mark.parametrize("folder_name", ["result", "result/"])
def test_generate_dataset_multiprocess(tmpdir, folder_name, styles):
    """One PNG per (input file x style combination); trailing-slash folder names must work."""
    assert len(INPUT_TEXT_FILENAMES) > 0
    output_folder = os.path.join(tmpdir, folder_name)
    generate_dataset_multiprocess(
        INPUT_TEXT_FILENAMES, output_folder, styles, DEGRATIONS, "text_block.html.jinja"
    )
    num_generated_img = glob.glob(os.path.join(output_folder, "**", "*.png"))
    assert len(num_generated_img) > 0
    assert len(num_generated_img) == len(INPUT_TEXT_FILENAMES) * len(DocumentGenerator.expand_style_combinations(styles))
102 |
--------------------------------------------------------------------------------
/tests/e2e/test_splitter.py:
--------------------------------------------------------------------------------
1 | import difflib
2 | import os
3 |
4 | from genalog.text.splitter import CONLL2003_DOC_SEPERATOR, generate_splits
5 |
6 |
7 | def _compare_content(file1, file2):
8 | txt1 = open(file1, "r").read()
9 | txt2 = open(file2, "r").read()
10 | sentences_txt1 = txt1.split("\n")
11 | sentences_txt2 = txt2.split("\n")
12 | if txt1 != txt2:
13 | str_diff = "\n".join(difflib.unified_diff(sentences_txt1, sentences_txt2))
14 | assert False, f"Delta between outputs: \n {str_diff}"
15 |
16 |
def test_splitter(tmpdir):
    """Split an example CoNLL-2012 file and compare against golden outputs."""
    # tmpdir = "test_out"
    os.makedirs(f"{tmpdir}/clean_labels")
    os.makedirs(f"{tmpdir}/clean_text")

    generate_splits(
        "tests/e2e/data/splitter/example_conll2012.txt",
        tmpdir,
        doc_seperator=CONLL2003_DOC_SEPERATOR,
        sentence_seperator="",
    )

    # Each golden split file must match the freshly generated one.
    golden_root = "tests/e2e/data/splitter/example_splits"
    for subdir in ("clean_text", "clean_labels"):
        for doc_id in ("0", "1"):
            _compare_content(
                f"{golden_root}/{subdir}/{doc_id}.txt",
                f"{tmpdir}/{subdir}/{doc_id}.txt",
            )
45 |
--------------------------------------------------------------------------------
/tests/required_env.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from itertools import chain
3 |
4 |
class RequiredSecrets(Enum):
    """Names of environment variables that hold secret values."""
    BLOB_KEY = 'BLOB_KEY'
    SEARCH_SERVICE_KEY = 'SEARCH_SERVICE_KEY'
    COGNITIVE_SERVICE_KEY = 'COGNITIVE_SERVICE_KEY'
9 |
10 |
class RequiredConstants(Enum):
    """Names of required non-secret configuration environment variables."""
    COMPUTER_VISION_ENDPOINT = 'COMPUTER_VISION_ENDPOINT'
    SEARCH_SERVICE_NAME = 'SEARCH_SERVICE_NAME'
    SKILLSET_NAME = 'SKILLSET_NAME'
    INDEX_NAME = "INDEX_NAME"
    INDEXER_NAME = "INDEXER_NAME"
    DATASOURCE_NAME = "DATASOURCE_NAME"
    DATASOURCE_CONTAINER_NAME = "DATASOURCE_CONTAINER_NAME"
    BLOB_NAME = "BLOB_NAME"
20 |
21 |
# Aggregate enum: the union of all required environment variable names
# (secrets plus non-secret constants), built via the Enum functional API.
RequiredEnvVar = Enum("RequiredEnvVar", {
    member.name: member.value for member in chain(RequiredSecrets, RequiredConstants)
})
25 |
--------------------------------------------------------------------------------
/tests/unit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/__init__.py
--------------------------------------------------------------------------------
/tests/unit/cases/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/cases/__init__.py
--------------------------------------------------------------------------------
/tests/unit/cases/label_propagation.py:
--------------------------------------------------------------------------------
# Test cases for genalog.text.ner_label.propagate_label_to_ocr() method.
# For READABILITY purposes, ground truth and noisy text are presented as
# a whole string, not in their tokenized format.

# Notice the `propagate_label_to_ocr()` method has the contract of
# (list, list, list) -> (list, list, list)
# consuming both ground truth text and noisy text as lists of tokens.
# We will use `genalog.text.preprocess.tokenize()` to tokenize these strings
from genalog.text import preprocess

# Parallel lists: case i is defined by
# (ner_labels[i], gt_txt[i], ns_txt[i], desired_ocr_labels[i]).
ner_labels = []
gt_txt = []
ns_txt = []
desired_ocr_labels = []

# Alignment is one-to-one
ner_labels.append(["B-PLACE", "I-PLACE"])
gt_txt.append("New York")
ns_txt.append("New York")
desired_ocr_labels.append(["B-PLACE", "I-PLACE"])

# Alignment is one-to-many
ner_labels.append(["B-PLACE", "I-PLACE"])
gt_txt.append("New York")
ns_txt.append("N ew York")
desired_ocr_labels.append(["B-PLACE", "I-PLACE", "I-PLACE"])

# Trailing B-Labels
ner_labels.append(["B-PLACE", "I-PLACE", "O", "B-PLACE", "O", "B-PLACE"])
gt_txt.append("New York , Boston , Sidney")
ns_txt.append("N ew York Boston Sidney")
desired_ocr_labels.append(["B-PLACE", "I-PLACE", "I-PLACE", "B-PLACE", "B-PLACE"])

# Alignment is many-to-one
ner_labels.append(["B-PLACE", "I-PLACE"])
gt_txt.append("New York")
ns_txt.append("NewYork")
desired_ocr_labels.append(["B-PLACE"])

# Alignment is many-to-many
ner_labels.append(["B-PLACE", "I-PLACE", "O", "O"])
gt_txt.append("New York is big")
ns_txt.append("N ewYorkis big")
desired_ocr_labels.append(["B-PLACE", "I-PLACE", "O"])

# Missing tokens (I-label)
ner_labels.append(["B-PLACE", "I-PLACE", "V", "O"])
gt_txt.append("New York is big")
ns_txt.append("New is big")
desired_ocr_labels.append(["B-PLACE", "V", "O"])

# Missing tokens (B-label)
ner_labels.append(["B-PLACE", "I-PLACE", "V", "O"])
gt_txt.append("New York is big")
ns_txt.append(" York is big")
desired_ocr_labels.append(["B-PLACE", "V", "O"])

ner_labels.append(["O", "O", "B-PLACE"])
gt_txt.append("This is home")
ns_txt.append("Th isis ho me")
desired_ocr_labels.append(["O", "O", "B-PLACE", "I-PLACE"])

# Missing tokens + many-to-many
ner_labels.append(["B-PLACE", "I-PLACE", "O", "O"])
gt_txt.append("New York is big")
ns_txt.append("N ewYo rkis big")
desired_ocr_labels.append(["B-PLACE", "I-PLACE", "I-PLACE", "O"])

# Missing tokens + many-to-many
ner_labels.append(["B-PLACE", "O", "O"])
gt_txt.append("Boston is big ")
ns_txt.append("B oston bi g")
desired_ocr_labels.append(["B-PLACE", "I-PLACE", "O", "O"])

# Single char tokens
ner_labels.append(["O", "O", "B-PLACE"])
gt_txt.append("a big city")
ns_txt.append("abigcity")
desired_ocr_labels.append(["O"])

# Split into single-char tokens
ner_labels.append(["O", "O", "B-PLACE"])
gt_txt.append("a big city")
ns_txt.append("abig c it y")
desired_ocr_labels.append(["O", "B-PLACE", "I-PLACE", "I-PLACE"])

# Tokens with repeating characters
ner_labels.append(["O", "FRUIT"])
gt_txt.append("an apple")
ns_txt.append("aa aaple")
desired_ocr_labels.append(["O", "FRUIT"])

# Tokens with regex special characters
ner_labels.append(["O", "FRUIT", "O"])
gt_txt.append("an apple .*/")
ns_txt.append("@n @ @p|e *. |")
desired_ocr_labels.append(["O", "FRUIT", "FRUIT", "O", "O"])

# Tokens with regex special characters with B-labels
ner_labels.append(["O", "B-FRUIT", "O"])
gt_txt.append("an apple .*/")
ns_txt.append("@n @ @p|e *. |")
desired_ocr_labels.append(["O", "B-FRUIT", "I-FRUIT", "O", "O"])

# Tokens with regex special characters in BOTH clean and noisy text
ner_labels.append(["O", "O", "ENTERTAINMENT", "O"])
gt_txt.append("@ new TV !")
ns_txt.append("@ n ow T\\/ |")
desired_ocr_labels.append(["O", "O", "O", "ENTERTAINMENT", "O"])

# Tokenize ground truth and noisy text strings
gt_tokens = [preprocess.tokenize(txt) for txt in gt_txt]
ns_tokens = [preprocess.tokenize(txt) for txt in ns_txt]

# test function expect params in tuple of
# (gt_label, gt_tokens, ocr_tokens, desired_ocr_labels)
LABEL_PROPAGATION_REGRESSION_TEST_CASES = list(
    zip(ner_labels, gt_tokens, ns_tokens, desired_ocr_labels)
)
120 |
--------------------------------------------------------------------------------
/tests/unit/degradation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/degradation/__init__.py
--------------------------------------------------------------------------------
/tests/unit/degradation/test_degrader.py:
--------------------------------------------------------------------------------
1 | import copy
2 | from unittest.mock import patch
3 |
4 | import numpy as np
5 | import pytest
6 |
7 | from genalog.degradation.degrader import DEFAULT_METHOD_PARAM_TO_INCLUDE
8 | from genalog.degradation.degrader import Degrader, ImageState
9 |
# 4x3 grayscale test image with pixel values 0..11.
MOCK_IMAGE_SHAPE = (4, 3)
MOCK_IMAGE = np.arange(12, dtype=np.uint8).reshape(MOCK_IMAGE_SHAPE)
12 |
13 |
@pytest.fixture
def empty_degrader():
    """Degrader configured with no effects."""
    effects = []
    return Degrader(effects)
18 |
19 |
# Degrader built from each of several representative effect pipelines
# (single effect, explicit src states, and multi-effect chains).
@pytest.fixture(
    params=[
        [("blur", {"radius": 5})],
        [("blur", {"src": ImageState.ORIGINAL_STATE, "radius": 5})],
        [("blur", {"src": ImageState.CURRENT_STATE, "radius": 5})],
        [
            ("morphology", {"src": ImageState.ORIGINAL_STATE, "operation": "open"}),
            ("morphology", {"operation": "close"}),
            ("morphology", {"src": ImageState.ORIGINAL_STATE, "operation": "dilate"}),
            ("morphology", {"operation": "erode"}),
        ],
        [
            ("blur", {"radius": 5}),
            (
                "bleed_through",
                {
                    "src": ImageState.CURRENT_STATE,
                    "alpha": 0.7,
                    "background": ImageState.ORIGINAL_STATE,
                },
            ),
            (
                "morphology",
                {"operation": "open", "kernel_shape": (3, 3), "kernel_type": "ones"},
            ),
        ],
    ]
)
def degrader(request):
    effects = request.param
    return Degrader(effects)
51 |
52 |
def test_empty_degrader_init(empty_degrader):
    """A Degrader built from an empty effect list has nothing to apply."""
    assert empty_degrader.effects_to_apply == []
55 |
56 |
def test_degrader_init(degrader):
    """Every configured effect must carry a valid `src` image-state param."""
    # The original asserted `effects_to_apply is not []`, which is always
    # True (identity comparison against a fresh list object); compare by
    # value instead so an empty pipeline actually fails.
    assert degrader.effects_to_apply != []
    for effect_tuple in degrader.effects_to_apply:
        method_name, method_kwargs = effect_tuple
        assert DEFAULT_METHOD_PARAM_TO_INCLUDE in method_kwargs
        param_value = method_kwargs[DEFAULT_METHOD_PARAM_TO_INCLUDE]
        assert (
            param_value is ImageState.ORIGINAL_STATE
            or param_value is ImageState.CURRENT_STATE
        )
67 |
68 |
@pytest.mark.parametrize(
    "effects, error_thrown",
    [
        ([], None),  # Empty effect
        (None, TypeError),
        ([("blur", {"radius": 5})], None),  # Validate input
        ([("not_a_func", {"radius": 5})], ValueError),  # Invalid method name
        ([("blur", {"not_a_argument": 5})], ValueError),  # Invalid kwargs
        ([("blur")], ValueError),  # Missing kwargs (note: ("blur") is just the string "blur")
        (
            [
                ("blur", {"radius": 5}),
                ("bleed_through", {"alpha": "0.8"}),
                ("morphology", {"operation": "open"}),
            ],
            None,
        ),  # Multiple effects
        (
            [
                ("blur", {"radius": 5}),
                ("bleed_through", {"not_argument": "0.8"}),
                ("morphology", {"missing value"}),  # a set, not a kwargs dict
            ],
            ValueError,
        ),  # Multiple effects
    ],
)
def test_degrader_validate_effects(effects, error_thrown):
    """validate_effects raises the expected error type (or none) per input."""
    if error_thrown:
        with pytest.raises(error_thrown):
            Degrader.validate_effects(effects)
    else:
        Degrader.validate_effects(effects)
102 |
103 |
def test_degrader_apply_effects(degrader):
    """Exercise apply_effects with the effect module patched out."""
    method_names = [effect[0] for effect in degrader.effects_to_apply]
    with patch("genalog.degradation.effect") as mock_effect:
        degrader.apply_effects(MOCK_IMAGE)
        for method in method_names:
            # NOTE(review): `mock_effect[method].is_called()` returns a child
            # Mock (always truthy), so this assert can never fail. The real
            # check would be `getattr(mock_effect, method).called`, and the
            # patch target may need to be the module as imported by the
            # degrader (e.g. "genalog.degradation.degrader.effect").
            # TODO: confirm and tighten.
            assert mock_effect[method].is_called()
        # assert degraded.shape == MOCK_IMAGE_SHAPE
111 |
112 |
def test_degrader_apply_effects_e2e(degrader):
    """Real (unmocked) run: output keeps the input's shape and dtype."""
    degraded = degrader.apply_effects(MOCK_IMAGE)
    assert degraded.shape == MOCK_IMAGE_SHAPE
    assert degraded.dtype == np.uint8
117 |
118 |
def test_degrader_instructions(degrader):
    """apply_effects must not mutate the configured instruction list."""
    original_instruction = copy.deepcopy(degrader.effects_to_apply)
    degrader.apply_effects(MOCK_IMAGE)
    degrader.apply_effects(MOCK_IMAGE)
    # Make sure the degradation instructions are not altered
    assert len(original_instruction) == len(degrader.effects_to_apply)
    for i in range(len(original_instruction)):
        org_method_name, org_method_arg = original_instruction[i]
        method_name, method_arg = degrader.effects_to_apply[i]
        assert org_method_name == method_name
        assert len(org_method_arg) == len(method_arg)
        for key in org_method_arg.keys():
            assert isinstance(org_method_arg[key], type(method_arg[key]))
            assert org_method_arg[key] == method_arg[key]
133 |
--------------------------------------------------------------------------------
/tests/unit/generation/2x2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/generation/2x2.jpg
--------------------------------------------------------------------------------
/tests/unit/generation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/generation/__init__.py
--------------------------------------------------------------------------------
/tests/unit/generation/templates/font_family.html.jinja:
--------------------------------------------------------------------------------
1 | {{font_family}}
--------------------------------------------------------------------------------
/tests/unit/generation/templates/mock.html.jinja:
--------------------------------------------------------------------------------
1 | {{ content }}
--------------------------------------------------------------------------------
/tests/unit/generation/templates/multipage.html.jinja:
--------------------------------------------------------------------------------
1 | {% if content and content.content_type.__str__() == "ContentType.COMPOSITE" %}
2 | {% for c in content %}
3 | {% if c.content_type.__str__() == "ContentType.PARAGRAPH"%}
4 |
5 | {{ c }}
6 |
7 | {% else %}
8 | Unsupported Content Type: {{c.content_type.__str__()}}
9 | {% endif %}
10 | {% endfor %}
11 | {% else %}
12 | No content loaded
13 | {% endif %}
14 |
--------------------------------------------------------------------------------
/tests/unit/generation/test_content.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from genalog.generation.content import CompositeContent, Content, ContentType
4 | from genalog.generation.content import Paragraph, Title
5 |
# Children and matching content types for the CompositeContent fixture.
CONTENT_LIST = ["foo", "bar"]
COMPOSITE_CONTENT_TYPE = [ContentType.TITLE, ContentType.PARAGRAPH]
TEXT = "foo bar"
9 |
10 |
@pytest.fixture
def content_base_class():
    """Bare Content base-class instance."""
    return Content()


@pytest.fixture
def paragraph():
    """Paragraph wrapping TEXT."""
    return Paragraph(TEXT)


@pytest.fixture
def title():
    """Title wrapping TEXT."""
    return Title(TEXT)


@pytest.fixture
def section():
    """CompositeContent holding a title and a paragraph."""
    return CompositeContent(CONTENT_LIST, COMPOSITE_CONTENT_TYPE)
29 |
30 |
def test_content_set_content_type(content_base_class):
    """set_content_type accepts ContentType members and rejects other values."""
    with pytest.raises(TypeError):
        content_base_class.set_content_type("NOT VALID CONTENT TYPE")
    content_base_class.set_content_type(ContentType.PARAGRAPH)


def test_paragraph_init(paragraph):
    """Paragraph requires a string body and reports the PARAGRAPH type."""
    with pytest.raises(TypeError):
        Paragraph([])
    assert paragraph.content_type == ContentType.PARAGRAPH
41 |
42 |
def test_paragraph_print(paragraph):
    """A paragraph renders to a non-empty string."""
    # Idiomatic str() instead of calling the dunder directly.
    assert str(paragraph)
45 |
46 |
def test_paragraph_iterable_indexable(paragraph):
    """A paragraph iterates and indexes character-by-character consistently."""
    for index, character in enumerate(paragraph):
        assert character == paragraph[index]


def test_title_init(title):
    """Title requires a string body and reports the TITLE type."""
    with pytest.raises(TypeError):
        Title([])
    assert title.content_type == ContentType.TITLE


def test_title_iterable_indexable(title):
    """A title iterates and indexes character-by-character consistently."""
    for index, character in enumerate(title):
        assert character == title[index]


def test_composite_content_init(section):
    """CompositeContent rejects non-list input and reports the COMPOSITE type."""
    with pytest.raises(TypeError):
        CompositeContent((), [])
    assert section.content_type == ContentType.COMPOSITE


def test_composite_content_iterable(section):
    """Iterating a composite yields children in their declared type order."""
    for index, content in enumerate(section):
        assert content.content_type == COMPOSITE_CONTENT_TYPE[index]
72 |
73 |
def test_composite_content_print(section):
    """The composite's string form contains each child's text."""
    # Render once via idiomatic str() instead of two direct __str__() calls.
    rendered = str(section)
    assert "foo" in rendered
    assert "bar" in rendered
77 |
--------------------------------------------------------------------------------
/tests/unit/ocr/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/ocr/__init__.py
--------------------------------------------------------------------------------
/tests/unit/ocr/data/img/0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/ocr/data/img/0.png
--------------------------------------------------------------------------------
/tests/unit/ocr/data/img/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/ocr/data/img/1.png
--------------------------------------------------------------------------------
/tests/unit/ocr/data/img/11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/ocr/data/img/11.png
--------------------------------------------------------------------------------
/tests/unit/ocr/data/metrics.csv:
--------------------------------------------------------------------------------
1 | edit_insert,edit_delete,edit_replace,edit_insert_spacing,edit_delete_spacing,insert,delete,replace,spacing,total_chars,total_words,total_alnum_words,matching_chars,matching_alnum_words,matching_words,alnum_word_accuracy,word_accuracy,char_accuracy,txt_path,ocr_json_path,filename
2 | 1,0,0,1,13,1,0,0,14,1027,166,159,1025,144,150,0.9056603773584906,0.9036144578313253,0.9980525803310614,tests/unit/ocr/data/text/0.txt,tests/unit/ocr/data/json/521c38122f783673598856cd81d91c21_0.png.json,0.txt
3 | 3,0,0,0,5,3,0,0,5,958,182,176,955,165,171,0.9375,0.9395604395604396,0.9968684759916493,tests/unit/ocr/data/text/1.txt,tests/unit/ocr/data/json/521c38122f783673598856cd81d91c21_1.png.json,1.txt
4 | 2,0,0,0,9,2,0,0,9,1022,188,183,1020,170,175,0.9289617486338798,0.9308510638297872,0.9980430528375733,tests/unit/ocr/data/text/11.txt,tests/unit/ocr/data/json/521c38122f783673598856cd81d91c21_11.png.json,11.txt
5 |
--------------------------------------------------------------------------------
/tests/unit/ocr/data/metrics/json/123_001.json:
--------------------------------------------------------------------------------
1 | [{"text": "BIRDS\n\nOF\n\nGREAT BRITAIN AND IRELAND\n\nORDER PASSERES\n\nFAMILY ORIOLID^.\n\nTHIS famil}^ consists of a tropical group of brightly coloured birds in whicli\nyellow and black, or scarlet and black, are the prevailing hues. Although\nin the general form of their heads they somewhat remind one of Starlings,\nthey must not be confounded with the so-called \"Orioles\" of the New World,\nwhich belong to the family IdcridcE or Hang-nests and Troupials, a group of birds\nlinking the Finches and the Starlings, and feeding largely upon seeds and insects.\n\nThe late Henry Seebohm was of opinion that the Orioles were nearly related\nto the Crows ; he, therefore, placed the genus Oriolus in his Subfamily Corvince, from\nwhich he said that they chiefly differed in their exposed nostrils, although he\nadmitted that the tarsus might perhaps be slightly shorter, and the prevailing\ncolours different ; whilst the sexes also were dissimilar.*\n\nIn addition to the above distinctive characters, the third primary of the wing\n(not the fourth or fifth) appears to be the longest, in the Orioles; whilst the\n"}]
--------------------------------------------------------------------------------
/tests/unit/ocr/data/metrics/json/123_002.json:
--------------------------------------------------------------------------------
1 | [{"text": "whole character of the nest, which Seebohm often made much of iu his classifi-\ncation, is quite unlike that of a Crow ; being neatly woven, and slung like a\nhammock between the forks of a branch : moreover, whereas the eggs of the Crows\nare usually of some shade of green or blue, heavily spotted and speckled, or\nblotched and mottled, with various shades of olive or brown, those of the Orioles\nvary from white to salmon-pink, clearly spotted with blackish-brown, and some-\ntimes with lilacine-greyisli shell-markings.\n\nThe call-notes and songs of the Orioles are bright and melodious ; but this\nfact would not be a sufficient reason for dissociating them from the Crows ;\nalthough our native species of Corvidcr do not shine as whistlers, in their wild\nstate. I think, however, that Howard Saunders was fully justified in adopting\nthe present family for the Orioles.\n\nFamilx- ORIOL ID^E.\n\nThe Golden Oriole.\n\nOriolus ga/bula, LiNN.\n\nBREEDS in suitable localities throughout Europe south of the Baltic and in\nAlgeria ; passes through Greece, Asia Minor, Palestine, Egypt, and Nubia,\non migration ; and winters in North Africa, south-eastwards to Madagascar,\nNatal, and westwards to Damara Land : stragglers sometimes occur in Madeira,\nand the Azores.\n\nThe Golden Oriole is a regular visitor to our shores in spring, the largest\nnumber having been seen in the Scilly Islands, and Cornwall ; it has, however,\nbeen met with in not a few of the southern and south-eastern counties, and several\ninstances of its breeding with us have been recorded. In 1868, I saw a male\nspecimen of this bird near Linton, in Devonshire, and in July, 1887, I was just\ntoo late to see the species in Essex ; Mr. Fitch, of Maldon (whom I was visiting)\ninformed me that the bird had been seen in one of his thickets during the previous\n"}]
--------------------------------------------------------------------------------
/tests/unit/ocr/data/metrics/json/123_003.json:
--------------------------------------------------------------------------------
1 | [{"text": "week. We visited the place iu the hope of discovering a uest, but were unsuc-\ncessful.\n\nIn Ireland it has chiefly occurred on the east coast, most of the examples\nbeing females, or immature males ; a specimen was recorded as shot in the Faroe\nIslands, in Maj^ 1893, by Col. H. W. Feildeu. In June, 1906, one was killed\nby a cat on the Marine Parade at Brighton. Perhaps the nearest point to\nLondon at which it has been recognized, was noted in the \"Zoologist\" for 1892,\nan example having apparently been seen in Richmond Park.\n\nThe male of this species is bright gamboge-yellow, the lores, wings (excepting\nthe terminal third of the primary-coverts) and a great part of the tail black ; the\nprimaries, excepting the two outermost, are edged externally, and the secondaries\nare tipped with j^ellowish-white ; the two central tail-feathers are yellowish at the\nbase, and yellow at the tip, and the other feathers have the terminal third of the\nouter webs, and borders of the inner webs yellow ; bill reddish-ochreous ; feet\nleaden-grey ; iris bright red. 
The female is much duller than the male, greener,\nand with the black colouring replaced by deep brown ; the throat, breast, and\ncentre of belly whitish ; the throat, breast, and flanks streaked with greyish.\nYoung birds are greener and browner than the female, but otherwise similar ;\nnestlings have the upper parts olivaceous, spotted with yellow.\n\nThe Golden Oriole frequents gardens, groves, plantations, thickets, and the\noutskirts of large woods, especially in the neighbourhood of water ; it seems to\nprefer the haunts of man, yet is so shy that it rarely remains in view for more\nthan a minute as it flies rapidly, in somewhat Thrush-like, though more undulating\nfashion, from cover to cover ; choosing ever the densest foliage, as if aware of the\nperilous brilliance of its plumage : possibly it may slowly be acquiring a hereditary\nknowledge of the fact that, if but a glimpse is obtained of it, an attempt at least\nis made to put an end to its life ; or if it fails to comprehend so much, it may\ninherit a dread of the thunder and lightning which, for generations, have heralded\nits appearance : birds are not naturally fearful of man ; for even those which have\nbeen taught by their parents to dread him, can be generally converted by gentleness\nand petting : moreover the fact that a grown man can tame a small bird, whereas\neven the tamest will always show the greatest fear of a little boy, certainly seems\nto prove that the instinctive dread of the monkey-nature in the latter is deeply\nimplanted in all birds ; j ust as is that of a cat, even though that animal may\nnever have been seen by the bird previously.*\n"}]
--------------------------------------------------------------------------------
/tests/unit/ocr/data/metrics/metrics.csv:
--------------------------------------------------------------------------------
1 | edit_insert,edit_delete,edit_replace,edit_insert_spacing,edit_delete_spacing,insert,delete,replace,spacing,total_chars,total_words,total_alnum_words,matching_chars,matching_alnum_words,matching_words,alnum_word_accuracy,word_accuracy,char_accuracy,txt_path,ocr_json_path,filename
2 | 2,5,5,0,2,1,1,5,2,1068,176,176,1061,169,169,0.9602272727272727,0.9602272727272727,0.9934456928838952,tests/unit/ocr/data/metrics/text/001.txt,tests/unit/ocr/data/metrics/json/123_001.png.json,001.txt
3 | 0,5,17,0,11,0,2,8,11,1789,301,301,1772,283,283,0.9401993355481728,0.9401993355481728,0.9904974846282839,tests/unit/ocr/data/metrics/text/002.txt,tests/unit/ocr/data/metrics/json/123_002.png.json,002.txt
4 | 0,1,6,0,17,0,0,5,17,2659,460,459,2653,436,437,0.9498910675381264,0.95,0.9977435125987213,tests/unit/ocr/data/metrics/text/003.txt,tests/unit/ocr/data/metrics/json/123_003.png.json,003.txt
5 |
--------------------------------------------------------------------------------
/tests/unit/ocr/data/metrics/substitution.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/ocr/data/metrics/substitution.pkl
--------------------------------------------------------------------------------
/tests/unit/ocr/data/metrics/text/001.txt:
--------------------------------------------------------------------------------
1 | BIRDS
2 |
3 | OF
4 |
5 | GREAT BRITAIN AND IRELAND
6 |
7 | ORDER PASSERES
8 |
9 | FAMILY ORIOLIDA.
10 |
11 | THIS family consists of a tropical group of brightly coloured birds in which
12 | yellow and black, or scarlet and black, are the prevailing hues. Although
13 | in the general form of their heads they somewhat remind one of Starlings,
14 | they must not be confounded with the so-called "Orioles" of the New World,
15 | which belong to the family Icterida or Hang-nests and Troupials, a group of birds
16 | linking the Finches and the Starlings, and feeding largely upon seeds and insects.
17 |
18 | The late Henry Seebohm was of opinion that the Orioles were nearly related
19 | to the Crows; he, therefore, placed the genus Oriolus in his Subfamily Corvina, from
20 | which he said that they chiefly differed in their exposed nostrils, although he
21 | admitted that the tarsus might perhaps be slightly shorter, and the prevailing
22 | colours different; whilst the sexes also were dissimilar.*
23 |
24 | In addition to the above distinctive characters, the third primary of the wing
25 | (not the fourth or fifth) appears to be the longest, in the Orioles; whilst the
26 |
--------------------------------------------------------------------------------
/tests/unit/ocr/data/metrics/text/002.txt:
--------------------------------------------------------------------------------
1 | whole character of the nest, which Seebohm often made much of in his classifi-
2 | cation, is quite unlike that of a Crow; being neatly woven, and slung like a
3 | hammock between the forks of a branch: moreover, whereas the eggs of the Crows
4 | are usually of some shade of green or blue, heavily spotted and speckled, or
5 | blotched and mottled, with various shades of olive or brown, those of the Orioles
6 | vary from white to salmon-pink, clearly spotted with blackish-brown, and some-
7 | times with lilacine-greyish shell-markings.
8 |
9 | The call-notes and songs of the Orioles are bright and melodious; but this
10 | fact would not be a sufficient reason for dissociating them from the Crows;
11 | although our native species of Corvid do not shine as whistlers, in their wild
12 | state. I think, however, that Howard Saunders was fully justified in adopting
13 | the present family for the Orioles.
14 |
15 | Family-ORIOLID.
16 |
17 | THE GOLDEN ORIOLE.
18 |
19 | Oriolus galbula, LINN.
20 |
21 | BREEDS in suitable localities throughout Europe south of the Baltic and in
22 | Algeria; passes through Greece, Asia Minor, Palestine, Egypt, and Nubia,
23 | on migration; and winters in North Africa, south-eastwards to Madagascar,
24 | Natal, and westwards to Damara Land: stragglers sometimes occur in Madeira,
25 | and the Azores.
26 |
27 | The Golden Oriole is a regular visitor to our shores in spring, the largest
28 | number having been seen in the Scilly Islands, and Cornwall; it has, however,
29 | been met with in not a few of the southern and south-eastern counties, and several
30 | instances of its breeding with us have been recorded. In 1868, I saw a male
31 | specimen of this bird near Linton, in Devonshire, and in July, 1887, I was just
32 | too late to see the species in Essex; Mr. Fitch, of Maldon (whom I was visiting)
33 | informed me that the bird had been seen in one of his thickets during the previous
34 |
--------------------------------------------------------------------------------
/tests/unit/ocr/data/metrics/text/003.txt:
--------------------------------------------------------------------------------
1 | week. We visited the place in the hope of discovering a nest, but were unsuc-
2 | cessful.
3 |
4 | In Ireland it has chiefly occurred on the east coast, most of the examples
5 | being females, or immature males; a specimen was recorded as shot in the Faroe
6 | Islands, in May, 1893, by Col. H. W. Feilden. In June, 1906, one was killed
7 | by a cat on the Marine Parade at Brighton. Perhaps the nearest point to
8 | London at which it has been recognized, was noted in the "Zoologist" for 1892,
9 | an example having apparently been seen in Richmond Park.
10 |
11 | The male of this species is bright gamboge-yellow, the lores, wings (excepting
12 | the terminal third of the primary-coverts) and a great part of the tail black; the
13 | primaries, excepting the two outermost, are edged externally, and the secondaries
14 | are tipped with yellowish-white; the two central tail-feathers are yellowish at the
15 | base, and yellow at the tip, and the other feathers have the terminal third of the
16 | outer webs, and borders of the inner webs yellow; bill reddish-ochreous; feet
17 | leaden-grey; iris bright red. The female is much duller than the male, greener,
18 | and with the black colouring replaced by deep brown; the throat, breast, and
19 | centre of belly whitish; the throat, breast, and flanks streaked with greyish.
20 | Young birds are greener and browner than the female, but otherwise similar;
21 | nestlings have the upper parts olivaceous, spotted with yellow.
22 |
23 | The Golden Oriole frequents gardens, groves, plantations, thickets, and the
24 | outskirts of large woods, especially in the neighbourhood of water; it seems to
25 | prefer the haunts of man, yet is so shy that it rarely remains in view for more
26 | than a minute as it flies rapidly, in somewhat Thrush-like, though more undulating
27 | fashion, from cover to cover; choosing ever the densest foliage, as if aware of the
28 | perilous brilliance of its plumage: possibly it may slowly be acquiring a hereditary
29 | knowledge of the fact that, if but a glimpse is obtained of it, an attempt at least
30 | is made to put an end to its life; or if it fails to comprehend so much, it may
31 | inherit a dread of the thunder and lightning which, for generations, have heralded
32 | its appearance: birds are not naturally fearful of man ; for even those which have
33 | been taught by their parents to dread him, can be generally converted by gentleness
34 | and petting: moreover the fact that a grown man can tame a small bird, whereas
35 | even the tamest will always show the greatest fear of a little boy, certainly seems
36 | to prove that the instinctive dread of the monkey-nature in the latter is deeply
37 | implanted in all birds; just as is that of a cat, even though that animal may
38 | never have been seen by the bird previously.*
39 |
--------------------------------------------------------------------------------
/tests/unit/ocr/data/substitution.json:
--------------------------------------------------------------------------------
1 | {"tests/unit/ocr/data/text\\0.txt": {}, "tests/unit/ocr/data/text\\1.txt": {}, "tests/unit/ocr/data/text\\11.txt": {}}
--------------------------------------------------------------------------------
/tests/unit/ocr/data/substitution.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rwightman/genalog/f377f3faf8d2838dc3dfb90f47b0d8f1f2b756d6/tests/unit/ocr/data/substitution.pkl
--------------------------------------------------------------------------------
/tests/unit/ocr/data/text/0.txt:
--------------------------------------------------------------------------------
1 | basically ,it was unanimously agreed by the various relevant parties .To its determination ,the Chinese regulatory department compares this reform to a die that has been cast . takes time to prove whether the stock can really meet expectations ,and any deviations that arise during the reform can be promptly corrected . viewers ,the China News program will here .This is Xu Li .Thank you for watching .Coming up is the Focus program hosted by Wang Shilin . ,dear viewers .Hello ,dear viewers . to Focus Today .Today ,let 's turn attention to a road cave -in accident happened in Beijing over the holiday Before dawn on January 3 ,a sewage leakage accident occurred at the main side roads of Jingguang Bridge ,East Ring Road ,Beijing Municipality , in the road caving in .Relevant from Beijing Municipality promptly emergency contingency plans .The administration department carried out supervision near the accident scene . ,how did the emergency response activated by governmental departments effectively during the holiday ?
--------------------------------------------------------------------------------
/tests/unit/ocr/data/text/1.txt:
--------------------------------------------------------------------------------
1 | After the holiday ,what will be done handle citizens ' peak commute ? In ,what measures did relevant take to resolve issues such as waste ,heating ,and communication ,in order ensure that the lives of citizens not affected ? Well ,we have invited honorable guests to the studio today follow this topic with us .One of the honorable guests in the studio is Zhou Hanhua from the Institute of Law the Chinese Academy of Social .Hello .Next is Yang Yang ,a host of Traffic Radio Station .Hello .Welcome of you to the studio to participate our program .Well ,I especially want know ,ha ,how the two of you found the news on the day of the accident ? ,,about 11:00 m. yesterday ,ah ,I to find out through an SMS when I was .Uh-huh .Uh-huh .It happened that I going to have lunch with a friend ,um at noon .And then ,the friend first me an SMS ,Uh-huh .saying he would pick me up to go together .After that I received an SMS from 1860 .Uh-huh , was through an SMS .
--------------------------------------------------------------------------------
/tests/unit/ocr/data/text/11.txt:
--------------------------------------------------------------------------------
1 | Furthermore ,Chaoyang Road is an .Uh-huh .Whether it is Chaoyang Road the east -west direction or the main side roads of East Third Ring Road in south -north direction ,as we can see this diagram ,it can be said that the at the main and side roads of East Ring Road normally has quite heavy ,especially during commuting times . ,Chaoyang Road is a very important in the east -west direction .Yes . people living in the west want to over from the city ,they have to go this road .Hence ,if a traffic occurs at this place ,we can indeed ,ha ,how widespread ,ah ,the extent the impact will be ,such as the of cars caught in traffic jams .Yes , I think everyone can see that from buses that cross Jingguang Bridge . .As buses that cross the Third Ring are currently ,right now affected by Jingguang Bridge accident ,ah ,the results this morning show that 32 bus throughout the neighborhood have had be rerouted .Uh-huh .Well ,I think perhaps many friends in other places wondering how one place is able to 32 commuter routes .
--------------------------------------------------------------------------------
/tests/unit/ocr/test_ocr.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 |
4 | import pytest
5 | import requests
6 |
7 | from genalog.ocr.rest_client import GrokRestClient
8 |
9 |
@pytest.fixture(scope="module", autouse=True)
def set_azure_dummy_secrets(load_azure_resources):
    """Blank out the Azure secret env vars so no real credentials are used."""
    for secret_name in ("BLOB_KEY", "SEARCH_SERVICE_KEY", "COGNITIVE_SERVICE_KEY"):
        os.environ[secret_name] = ""
15 |
16 |
@pytest.fixture(autouse=True)
def setup_monkeypatch(monkeypatch):
    """Route every requests HTTP verb to MockedResponse so no network I/O occurs."""
    def fake_request(*args, **kwargs):
        return MockedResponse(args, kwargs)

    # Patch all four verbs the client may use with the same mock factory.
    for verb in ("put", "post", "get", "delete"):
        monkeypatch.setattr(requests, verb, fake_request)
27 |
28 |
class MockedResponse:
    """Minimal stand-in for ``requests.Response`` used by the monkeypatched verbs.

    ``json()`` dispatches on the request URL and returns canned Azure Search
    payloads, so the GROK REST client can exercise its full pipeline offline.
    """

    def __init__(self, args, kwargs):
        # args/kwargs are the positional and keyword arguments passed to
        # requests.<verb>(); the first positional argument is the URL.
        self.url = args[0]
        self.text = "response"
        # self.data = args[1]
        self.headers = kwargs["headers"]

    @staticmethod
    def _load_layout(path):
        """Load a JSON layout fixture, closing the file handle promptly.

        The original code left the ``open()`` handles unclosed; a ``with``
        block releases them deterministically.
        """
        with open(path, "r") as fixture:
            return json.load(fixture)

    def json(self):
        """Return a canned payload appropriate for the Azure Search endpoint hit.

        Raises:
            ValueError: if the URL matches no known endpoint.
        """
        if "search.windows.net/skillsets/" in self.url:
            return {}

        if "search.windows.net/indexers/" in self.url:
            if "status" in self.url:
                # Pretend the indexer already ran to completion successfully.
                return {"lastResult": {"status": "success"}, "status": "finished"}
            return {}

        if "search.windows.net/indexes/" in self.url:
            if "docs/search" in self.url:
                # Three OCR layout fixtures mirror the documents the pipeline
                # would have indexed.
                doc_names = [
                    "521c38122f783673598856cd81d91c21_0.png",
                    "521c38122f783673598856cd81d91c21_1.png",
                    "521c38122f783673598856cd81d91c21_11.png",
                ]
                return {
                    "value": [
                        {
                            "metadata_storage_name": name,
                            "layoutText": self._load_layout(
                                f"tests/unit/ocr/data/json/{name}.json"
                            ),
                        }
                        for name in doc_names
                    ]
                }
            # NOTE(review): unlike every other branch this returns a JSON
            # *string*, not a dict — looks accidental; confirm before changing.
            return json.dumps({})
        if "search.windows.net/datasources/" in self.url:
            return {}

        raise ValueError(f"{self.url} not valid")

    def raise_for_status(self):
        """No-op: mocked responses always represent HTTP success."""
        pass
86 |
87 |
class TestGROK:
    """Exercise the GROK indexing pipeline against the mocked HTTP layer."""

    def test_creating_indexing_pipeline(self):
        """Pipeline creation and teardown complete without raising."""
        client = GrokRestClient.create_from_env_var()
        client.create_indexing_pipeline()
        client.delete_indexer_pipeline()

    def test_running_indexer(self):
        """Running the indexer ends with a successful lastResult status."""
        client = GrokRestClient.create_from_env_var()
        client.create_indexing_pipeline()

        status = client.get_indexer_status()
        if status["status"] == "error":
            raise RuntimeError(f"indexer error: {status}")

        # Kick off the indexer unless a run is already in flight.
        if status["lastResult"]["status"] != "inProgress":
            client.run_indexer()

        client.run_indexer()
        status = client.poll_indexer_till_complete()
        assert status["lastResult"]["status"] == "success"
        client.delete_indexer_pipeline()
110 |
--------------------------------------------------------------------------------
/tests/unit/text/data/gt_1.txt:
--------------------------------------------------------------------------------
1 | The book Between You and Me /. Hello /. Good evening /. Hi /. I 'd like to ask Mr. Wallace if he 's ever turned down an inter- /- Turned down a what /? Interview /. Turned down an interview /? Were you ever asked by CBS to say go do this guy /. and you said no /? If so I do n't remember /. No I do n't think so /. Orlando Florida hello /. Mike I wanted to know /. Go ahead /. the first time I was in New York I saw a nice looking young man on TV in a show Mike and Buffy /. was that you /? That was me he he /. That was me and Buff Cobb who was my /- That 's not Buff talking , is it /? No no ha ha /. cause Buff is up in New Hampshire /. She lives in a home up there /. She 's not well /. Um yeah she and I used to do a show on CBS when I first came to New York /. and it was a fascinating /. it was a little bit like Regis and uh Kathy or uh Regis and Kelly /. But you were married , right /? Yes /. What was it like to do a show with the wife /? Not easy /. Ha ha /. I 'm serious /. You know uh I 'd love to see that /. uh We used to bicker on the air /. and what happened was after a while the bickering continued after we got off the air /. After you got off the air /. You know what I mean /. I know /. Detroit hello /. Hi /. Hi /. How are you /? Fine /. Mr. Wallace this is a big pleasure for me to talk to you /. But um uh what is your most difficult interview that you had in Sixty Minutes the most difficult person that you could have ever interviewed /? I think probably the Aiatola really because he was not anxious to do it /. It was um just after the US hostages had been taken in Iran /. and I was surprised that he was willing to talk to us /. and it was a very very difficult business /. We did it in the holy city of which uh we /- and the circumstances were difficult /. They took good care to see that we did n't get into trouble /. Ha ha /. We 'll take a break /. And he just /- We 'll be back with more of Mike Wallace /. The book is Between You and Me /. 
the DVD is included /. oh what can one say it 's a terrific work /. We 'll be right back /. That voice was the subject of The Insider /. That man /. that man remains my hero /. Jeff Wygan who took on the tobacco cartel if you will /. And you remember when all those guys who ran the companies raised their hands and said Oh it 's not addictive /. they knew it was addictive /. And he has succeeded /. I mean really he has succeeded /. He runs a foundation for Smoke Free Kids /. and he 's gotten all kinds of success in all kinds of ways in foreign countries and so forth /. The man is my hero /. And you are mine /. And we have a minute and a half left /. I know you 're asked this all the time /. but how long you going to keep on keeping on /. How long you /- you know what the dickens would I do /? what would I do /? How long are you going to keep doing what you 're /- Yeah but you 're /- How old are you Mike /? Eighty - seven /. can you imagine /? I 'm going to be seventy - two /. so you 're fifteen years older than me /. That 's why I feel like a kid compared to you /. /.
--------------------------------------------------------------------------------
/tests/unit/text/data/label_generator/labels/0.tsv:
--------------------------------------------------------------------------------
1 | basically O
2 | , O
3 | it O
4 | was O
5 | unanimously O
6 | agreed O
7 | upon O
8 | by O
9 | the O
10 | various O
11 | relevant O
12 | parties O
13 | . O
14 |
15 | To O
16 | express O
17 | its O
18 | determination O
19 | , O
20 | the O
21 | Chinese O
22 | securities O
23 | regulatory O
24 | department O
25 | compares O
26 | this O
27 | stock O
28 | reform O
29 | to O
30 | a O
31 | die O
32 | that O
33 | has O
34 | been O
35 | cast O
36 | . O
37 |
38 | It O
39 | takes O
40 | time O
41 | to O
42 | prove O
43 | whether O
44 | the O
45 | stock O
46 | reform O
47 | can O
48 | really O
49 | meet O
50 | expectations O
51 | , O
52 | and O
53 | whether O
54 | any O
55 | deviations O
56 | that O
57 | arise O
58 | during O
59 | the O
60 | stock O
61 | reform O
62 | can O
63 | be O
64 | promptly O
65 | corrected O
66 | . O
67 |
68 | Dear O
69 | viewers B-PERSONTYPE
70 | , O
71 | the O
72 | China B-ORGANIZATION
73 | News I-ORGANIZATION
74 | program O
75 | will O
76 | end O
77 | here O
78 | . O
79 |
80 | This O
81 | is O
82 | Xu B-PERSONNAME
83 | Li I-PERSONNAME
84 | . O
85 |
86 | Thank O
87 | you O
88 | everyone O
89 | for O
90 | watching O
91 | . O
92 |
93 | Coming O
94 | up O
95 | is O
96 | the O
97 | Focus B-ORGANIZATION
98 | Today I-ORGANIZATION
99 | program O
100 | hosted O
101 | by O
102 | Wang B-PERSONNAME
103 | Shilin I-PERSONNAME
104 | . O
105 |
106 | Good-bye O
107 | , O
108 | dear O
109 | viewers B-PERSONTYPE
110 | . O
111 |
112 | Hello O
113 | , O
114 | dear O
115 | viewers B-PERSONTYPE
116 | . O
117 |
118 | Welcome O
119 | to O
120 | Focus B-ORGANIZATION
121 | Today I-ORGANIZATION
122 | . O
123 |
124 | Today B-DATE
125 | , O
126 | let O
127 | 's O
128 | turn O
129 | our O
130 | attention O
131 | to O
132 | a O
133 | road O
134 | cave O
135 | - O
136 | in O
137 | accident O
138 | that O
139 | happened O
140 | in O
141 | Beijing B-GPE
142 | over O
143 | the O
144 | holiday O
145 | . O
146 |
147 | Before B-DATETIMERANGE
148 | dawn I-DATETIMERANGE
149 | on O
150 | January B-DATE
151 | 3 I-DATE
152 | , O
153 | a O
154 | sewage O
155 | pipe O
156 | leakage O
157 | accident O
158 | occurred O
159 | at O
160 | the O
161 | main O
162 | and O
163 | side O
164 | roads O
165 | of O
166 | Jingguang B-LOCATION
167 | Bridge I-LOCATION
168 | , O
169 | East B-ADDRESS
170 | Third I-ADDRESS
171 | Ring I-ADDRESS
172 | Road I-ADDRESS
173 | , O
174 | Beijing B-GPE
175 | Municipality I-GPE
176 | , O
177 | resulting O
178 | in O
179 | the O
180 | road O
181 | caving O
182 | in O
183 | . O
184 |
185 | Relevant O
186 | departments O
187 | from O
188 | Beijing B-GPE
189 | Municipality I-GPE
190 | promptly O
191 | activated O
192 | emergency O
193 | contingency O
194 | plans O
195 | . O
196 |
197 | The O
198 | traffic O
199 | administration O
200 | department O
201 | carried O
202 | out O
203 | traffic O
204 | supervision O
205 | near O
206 | the O
207 | accident O
208 | scene O
209 | . O
210 |
211 | Well O
212 | , O
213 | how O
214 | did O
215 | the O
216 | emergency O
217 | response O
218 | mechanisms O
219 | activated O
220 | by O
221 | governmental O
222 | departments O
223 | operate O
224 | effectively O
225 | during O
226 | the O
227 | holiday O
228 | ? O
229 |
230 |
--------------------------------------------------------------------------------
/tests/unit/text/data/label_generator/labels/1.tsv:
--------------------------------------------------------------------------------
1 | After O
2 | the O
3 | holiday O
4 | , O
5 | what O
6 | will O
7 | be O
8 | done O
9 | to O
10 | handle O
11 | citizens B-PERSONTYPE
12 | ' O
13 | peak O
14 | commute O
15 | ? O
16 |
17 | In O
18 | addition O
19 | , O
20 | what O
21 | measures O
22 | did O
23 | relevant O
24 | departments O
25 | take O
26 | to O
27 | resolve O
28 | issues O
29 | such O
30 | as O
31 | waste O
32 | discharge O
33 | , O
34 | heating O
35 | , O
36 | and O
37 | communication O
38 | , O
39 | in O
40 | order O
41 | to O
42 | ensure O
43 | that O
44 | the O
45 | lives O
46 | of O
47 | citizens B-PERSONTYPE
48 | were O
49 | not O
50 | affected O
51 | ? O
52 |
53 | Well O
54 | , O
55 | we O
56 | have O
57 | invited O
58 | two B-NUMBER
59 | honorable O
60 | guests B-PERSONTYPE
61 | to O
62 | the O
63 | studio B-LOCATION
64 | today B-DATE
65 | to O
66 | follow O
67 | this O
68 | topic O
69 | with O
70 | us O
71 | . O
72 |
73 | One B-NUMBER
74 | of O
75 | the O
76 | two B-NUMBER
77 | honorable O
78 | guests B-PERSONTYPE
79 | in O
80 | the O
81 | studio B-LOCATION
82 | is O
83 | Professor O
84 | Zhou B-PERSONNAME
85 | Hanhua I-PERSONNAME
86 | from O
87 | the O
88 | Institute B-ORGANIZATION
89 | of I-ORGANIZATION
90 | Law I-ORGANIZATION
91 | of O
92 | the O
93 | Chinese B-ORGANIZATION
94 | Academy I-ORGANIZATION
95 | of I-ORGANIZATION
96 | Social I-ORGANIZATION
97 | Sciences I-ORGANIZATION
98 | . O
99 |
100 | Hello O
101 | . O
102 |
103 | Next B-ORDINAL
104 | is O
105 | Yang B-PERSONNAME
106 | Yang I-PERSONNAME
107 | , O
108 | a O
109 | host O
110 | of O
111 | Beijing B-ORGANIZATION
112 | Traffic I-ORGANIZATION
113 | Radio I-ORGANIZATION
114 | Station I-ORGANIZATION
115 | . O
116 |
117 | Hello O
118 | . O
119 |
120 | Welcome O
121 | both O
122 | of O
123 | you O
124 | to O
125 | the O
126 | studio O
127 | to O
128 | participate O
129 | in O
130 | our O
131 | program O
132 | . O
133 |
134 | Well O
135 | , O
136 | I O
137 | especially O
138 | want O
139 | to O
140 | know O
141 | , O
142 | ha O
143 | , O
144 | how O
145 | the O
146 | two B-NUMBER
147 | of O
148 | you O
149 | found O
150 | out O
151 | the O
152 | news O
153 | on O
154 | the B-DATE
155 | day I-DATE
156 | of O
157 | the O
158 | accident B-EVENT
159 | ? O
160 |
161 | Ah O
162 | , O
163 | , O
164 | about O
165 | 11:00 B-NUMBER
166 | m. O
167 | yesterday B-DATE
168 | , O
169 | ah O
170 | , O
171 | I O
172 | happened O
173 | to O
174 | find O
175 | out O
176 | through O
177 | an O
178 | SMS O
179 | when O
180 | I O
181 | was O
182 | outside O
183 | . O
184 |
185 | Uh-huh O
186 | . O
187 |
188 | Uh-huh O
189 | . O
190 |
191 | It O
192 | happened O
193 | that O
194 | I O
195 | was O
196 | going O
197 | to O
198 | have O
199 | lunch B-TIMERANGE
200 | with O
201 | a O
202 | friend B-PERSONTYPE
203 | , O
204 | um O
205 | , O
206 | at O
207 | noon B-TIME
208 | . O
209 |
210 | And O
211 | then O
212 | , O
213 | the O
214 | friend B-PERSONTYPE
215 | first B-ORDINAL
216 | sent O
217 | me O
218 | an O
219 | SMS O
220 | , O
221 | Uh-huh O
222 | . O
223 |
224 | saying O
225 | he O
226 | would O
227 | come O
228 | pick O
229 | me O
230 | up O
231 | to O
232 | go O
233 | together O
234 | . O
235 |
236 | After O
237 | that O
238 | , O
239 | I O
240 | received O
241 | an O
242 | SMS O
243 | from B-DATERANGE
244 | 1860 I-DATERANGE
245 | . O
246 |
247 | Uh-huh O
248 | , O
249 | it O
250 | was O
251 | through O
252 | an O
253 | SMS B-ORGANIZATION
254 | . O
255 |
256 |
--------------------------------------------------------------------------------
/tests/unit/text/data/label_generator/text/0.txt:
--------------------------------------------------------------------------------
1 | basically, it was unanimously agreed upon by the various relevant parties. To express its determination, the Chinese securities regulatory department compares this stock reform to a die that has been cast. It takes time to prove whether the stock reform can really meet expectations, and whether any deviations that arise during the stock reform can be promptly corrected. Dear viewers, the China News program will end here. This is Xu Li. Thank you everyone for watching. Coming up is the Focus Today program hosted by Wang Shilin. Good-bye, dear viewers. Hello, dear viewers. Welcome to Focus Today. Today, let's turn our attention to a road cave - in accident that happened in Beijing over the holiday. Before dawn on January 3, a sewage pipe leakage accident occurred at the main and side roads of Jingguang Bridge , East Third Ring Road, Beijing Municipality, resulting in the road caving in. Relevant departments from Beijing Municipality promptly activated emergency contingency plans. The traffic administration department carried out traffic supervision near the accident scene. Well, how did the emergency response mechanisms activated by governmental departments operate effectively during the holiday ?
--------------------------------------------------------------------------------
/tests/unit/text/data/label_generator/text/1.txt:
--------------------------------------------------------------------------------
1 | After the holiday, what will be done to handle citizens' peak commute? In addition, what measures did relevant departments take to resolve issues such as waste discharge, heating, and communication, in order to ensure that the lives of citizens were not affected? Well, we have invited two honorable guests to the studio today to follow this topic with us. One of the two honorable guests in the studio is Professor Zhou Hanhua from the Institute of Law of the Chinese Academy of Social Sciences. Hello. Next is Yang Yang, a host of Beijing Traffic Radio Station. Hello. Welcome both of you to the studio to participate in our program. Well, I especially want to know, ha, how the two of you found out the news on the day of the accident? Ah,, about 11:00 m. yesterday, ah, I happened to find out through an SMS when I was outside. Uh-huh. Uh-huh. It happened that I was going to have lunch with a friend, um, at noon. And then, the friend first sent me an SMS, Uh-huh. saying he would come pick me up to go together. After that, I received an SMS from 1860. Uh-huh, it was through an SMS.
--------------------------------------------------------------------------------
/tests/unit/text/data/label_generator/text/11.txt:
--------------------------------------------------------------------------------
1 | And you, Yang Yang? A friend happened to call me. You were not at work that day? No. The station called me at noon and said something happened at Jingguang Bridge and that I had to go to the station immediately to research the upcoming program. Uh-huh, that means, er, you found out the accident through an information source at the station. Right, right, right. Uh-huh. Well, like Professor Zhou, I also received this news, ha, through a mobile phone SMS. At that time,, it can be said that this SMS was among the many, ha, SMS containing New Year wishes, like Happy New Year, received after the start of the New Year. Uh-huh. Ah, actually I felt a lot of warmth when I received that SMS. Although we live in the west instead of the east and it did not affect us much, I think it is very useful, ah, to inform people of this kind of news. Yes, exceptionally. Yes, exceptionally. Well, what in fact was the content of that SMS? Let's take a look via this footage, ha. I remember the SMS was written like this at that time, saying that, ah, there was a sewage pipe leakage accident on the side road at the southeast corner of Jingguang Bridge at East Third Ring Road, and, well, traffic supervision was implemented near Chaoyang Road, Jingguang Bridge, and East Third Ring Road, and requesting cars to make a detour. Some car owners said that it was very good that the SMS was sent. Furthermore, there was one last sentence in that SMS thanking citizens for their cooperation and support. Ah, after the SMS was sent ,, I felt it seems to be the first time that Beijing Municipality, ah, used an SMS to give notification at the time of a public emergency. I don't know, all of us are living in Beijing, is this the first time, Professor Zhou? Yes, in terms of an official notification , this should be the first time one was sent officially through 1860. Uh-huh.
--------------------------------------------------------------------------------
/tests/unit/text/data/ocr_1.txt:
--------------------------------------------------------------------------------
1 | The book Between you and me /. Hello / Good evening / Is /. I'd we to ask it Wallace if he's ever turned down an inter / Turned down a what / Interview /. Taned down an interview /? Were you ever asked by CBS to say go do this guy ? and your sant no /7 # so I do n't remember / No I don't think so /. Ontario Florida head / like I wanted to know / Go ahead / the first time I was in wes bork I sow a nice looking young man on TV in's show bike and Butty / was that you ? That was me he he /, That was me and Buff Coco who was my ). That's not But taking, Is it /f to no he he /. cause Butt is up in New Hampshire /. She lives in a home up there /. She's not well /. Um yeah she and I used to do a show on CaS when I first came to he's work / and it was a fascinating /. It was a lene be like Regis and on Kathy or on Recy's and Kelly /. But you were married , right /1 Yes /. What was it wise to do a show with the wife 7 Not easy /. Ha ha / I'm serious /. You know in I'd love to see that ! on we used to backer on live or I. and what happened was after a while the bickering corewed sher we got off the air / After you got off the # 1. You know what I mean /. I know / Decree helio / bill. Is /. How are you /7 Fire ). He Wallace this is a big pleasure for me to talk to you / But tan is what is your most officus aderview that you had it Sixty Minutes the most difficult person that you could have over interviewed /? I think probably the Alitois really because he was not anxious to do i / It was on just otter the US hostages had been taken in van /, and I was surprised that he was wasing to talk to us /. and it was a very very autocult business / we did it in the holy city of which of we / and the circumstances were offcult /, They took good care to see that we and n't got into trouble / Ha ha / we # take a break !, And he just / we " be back with more of mike wallace /. 
The book is Between lexi and his / the DVD is backand / on what can one say it's a terrific work / we'll be right back / That voice was the subject of The Raider /. That man I. that man remains my hero / jet Wygen who took on the tobacco cartel If you will /. And you remen. bar when all those guys who ran the companies raised their hands and said On It's not aceactive /. they knew & was abiktive / And he has succeeded /. I mean really he has succeeded /. He runs s founds tion for Smoke Free Kids / and he's gotten off kind of success in of kinds of ways in foreign countries and so forth /. The man is my have /. And your are mine /. And we have a miage and s hat let /. I know you 're asked this all the time / but how long you going to keep on keeping on /. How ling you /. you know what the dickens would I do /7 what waxat I do /7 How king are you going to keep doing what you 're / was but you 're / How old are you like /? fighty . seven /. can you imagine /? I'm going to be seventy . two /. so you're fifteen years older than me ). That's why I feel we e kil compared to you 1. 1.
--------------------------------------------------------------------------------
/tests/unit/text/test_lcs.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from genalog.text.lcs import LCS
4 |
5 |
@pytest.fixture(
    params=[
        ("", ""),  # both inputs empty
        ("abcde", "ace"),  # simple subsequence pair
    ]
)
def lcs(request):
    """Build an LCS instance from each parametrized pair of strings."""
    left, right = request.param
    return LCS(left, right)
15 |
16 |
def test_lcs_init(lcs):
    """Construction must eagerly populate both the LCS string and its length."""
    for attr_name in ("_lcs_len", "_lcs"):
        assert getattr(lcs, attr_name) is not None
20 |
21 |
@pytest.mark.parametrize(
    "str1, str2, expected_len, expected_lcs",
    [
        ("", "", 0, ""),  # empty
        ("abc", "abc", 3, "abc"),
        ("abcde", "ace", 3, "ace"),  # naive case
        ("a", "", 0, ""),  # no results
        ("abc", "cba", 1, "c"),  # multiple cases
        ("abcdgh", "aedfhr", 3, "adh"),
        ("abc.!\t\nd", "dxab", 2, "ab"),  # with punctuations
        (
            "New York @",
            "New @ York",
            len("New York"),
            "New York",
        ),  # with space-separated, tokens
        ("Is A Big City", "A Big City Is", len("A Big City"), "A Big City"),
        ("Is A Big City", "City Big Is A", len(" Big "), " Big "),  # reversed order
        # mixed order with similar tokens
        ("Is A Big City IS", "IS Big A City Is", len("I Big City I"), "I Big City I"),
        # casing
        (
            "Is A Big City IS a",
            "IS a Big City Is A",
            len("I Big City I "),
            "I Big City I ",
        ),
    ],
)
def test_lcs_e2e(str1, str2, expected_len, expected_lcs):
    """End-to-end check: LCS(str1, str2) must expose the expected longest common
    subsequence via get_str() and its length via get_len() for each case above."""
    lcs = LCS(str1, str2)
    assert expected_lcs == lcs.get_str()
    assert expected_len == lcs.get_len()
55 |
--------------------------------------------------------------------------------
/tests/unit/text/test_preprocess.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from genalog.text import preprocess
4 | from genalog.text.alignment import GAP_CHAR
5 |
6 |
@pytest.mark.parametrize(
    "token, replacement, desired_output",
    [
        ("", "_", ""),  # Do nothing to empty string
        (" ", "_", " "),  # Do nothing to whitespaces
        (" \n\t", "_", " \n\t"),
        ("ascii", "_", "ascii"),
        ("a s\nc\tii", "_", "a s\nc\tii"),
        ("ascii·", "_", "ascii"),  # Tokens with non-ASCII values
        ("·", "_", "_"),  # Tokens with non-ASCII values
    ],
)
def test_remove_non_ascii(token, replacement, desired_output):
    """remove_non_ascii() must strip non-ASCII characters, substituting
    `replacement` only when stripping would leave the token empty.

    The placeholder "·" in the parametrized tokens is swapped for every
    non-ASCII code point in [128, 1000) so many characters are exercised.
    """
    for code in range(128, 1000):  # non-ASCII values
        # BUG FIX: str.replace() returns a new string; the original code
        # discarded the result, so every iteration tested the same token.
        test_token = token.replace("·", chr(code))
        output = preprocess.remove_non_ascii(test_token, replacement)
        assert output == desired_output
24 |
25 |
@pytest.mark.parametrize(
    "s, desired_output",
    [
        (" New \t \n", ["New"]),
        # Mixed in gap char "@"
        (" @ @", ["@", "@"]),
        ("New York is big", ["New", "York", "is", "big"]),
        # Mixed multiple spaces and tabs
        (" New York \t is \t big", ["New", "York", "is", "big"]),
        # Mixed in punctuation
        ("New .York is, big !", ["New", ".York", "is,", "big", "!"]),
        # Mixed in gap char "@"
        ("@N@ew York@@@is,\t big@@@@@", ["@N@ew", "York@@@is,", "big@@@@@"]),
    ],
)
def test_tokenize(s, desired_output):
    """tokenize() splits on runs of whitespace while leaving punctuation
    and gap characters attached to their tokens."""
    assert preprocess.tokenize(s) == desired_output
44 |
45 |
@pytest.mark.parametrize(
    "tokens, desired_output",
    [
        # plain alphabetic tokens
        (
            ["New", "York", "is", "big"],
            "New York is big",
        ),
        # tokens carrying punctuation
        (
            ["New", ".York", "is,", "big", "!"],
            "New .York is, big !",
        ),
        # tokens carrying the gap char "@"
        (
            ["@N@ew", "York@@@is,", "big@@@@@"],
            "@N@ew York@@@is, big@@@@@",
        ),
    ],
)
def test_join_tokens(tokens, desired_output):
    """join_tokens() must concatenate tokens with single spaces, byte-for-byte."""
    joined = preprocess.join_tokens(tokens)
    assert joined == desired_output
68 |
69 |
@pytest.mark.parametrize(
    "c, desired_output",
    [
        # the alignment gap character is not spacing
        (GAP_CHAR, False),
        # letters are not spacing
        ("a", False),
        ("A", False),
        # punctuation is not spacing
        (".", False),
        ("!", False),
        (",", False),
        ("-", False),
        # whitespace characters separate tokens
        (" ", True),
        ("\n", True),
        ("\t", True),
    ],
)
def test__is_spacing(c, desired_output):
    """_is_spacing() returns True only for token-separating whitespace."""
    assert preprocess._is_spacing(c) == desired_output
91 |
92 |
@pytest.mark.parametrize(
    "text, desired_output",
    [
        ("", ""),
        ("w .", "w ."),
        ("w !", "w !"),
        ("w ?", "w ?"),
        ("w /.", "w /."),
        ("w /!", "w /!"),
        ("w /?", "w /?"),
        ("w1 , w2 .", "w1 , w2 ."),
        ("w1 . w2 .", "w1 . \nw2 ."),
        ("w1 /. w2 /.", "w1 /. \nw2 /."),
        ("w1 ! w2 .", "w1 ! \nw2 ."),
        ("w1 /! w2 /.", "w1 /! \nw2 /."),
        ("w1 ? w2 .", "w1 ? \nw2 ."),
        ("w1 /? w2 /.", "w1 /? \nw2 /."),
        ("U.S. . w2 .", "U.S. . \nw2 ."),
        ("w1 ??? w2 .", "w1 ??? w2 ."),  # not splitting
        ("w1 !!! w2 .", "w1 !!! w2 ."),
        ("w1 ... . w2 .", "w1 ... . \nw2 ."),
        ("w1 ... /. w2 /.", "w1 ... /. \nw2 /."),
        ("w1 /. /. w2 .", "w1 /. /. \nw2 ."),
        ("w1 /. /.", "w1 /. \n/."),
        ("w1 /. /. ", "w1 /. /. \n"),
        ("w1 ? ? ? ? w2 .", "w1 ? ? ? ? \nw2 ."),
        ("w1 /? /? /? /? w2 /.", "w1 /? /? /? /? \nw2 /."),
        ("w1 ! ! ! ! w2 .", "w1 ! ! ! ! \nw2 ."),
        ("w1 /! /! /! /! w2 /.", "w1 /! /! /! /! \nw2 /."),
    ],
)
def test_split_sentences(text, desired_output):
    """split_sentences() inserts a newline after each standalone sentence
    terminator (".", "!", "?", or their "/"-prefixed variants), leaving
    multi-character runs like "???" untouched, per the table above."""
    assert desired_output == preprocess.split_sentences(text)
126 |
127 |
@pytest.mark.parametrize(
    "token, desired_output",
    [
        # whitespace-only tokens are not separators
        ("", False),
        (" ", False),
        ("\n", False),
        ("\t", False),
        (" \n \t", False),
        # repeated punctuation is not a separator
        ("...", False),
        ("???", False),
        ("!!!", False),
        # single terminators and their "/"-prefixed forms are separators
        (".", True),
        ("!", True),
        ("?", True),
        ("/.", True),
        ("/!", True),
        ("/?", True),
    ],
)
def test_is_sentence_separator(token, desired_output):
    """is_sentence_separator() recognizes exactly the single-terminator tokens."""
    result = preprocess.is_sentence_separator(token)
    assert result == desired_output
149 |
--------------------------------------------------------------------------------
/tests/unit/text/test_utf8.py:
--------------------------------------------------------------------------------
1 | import random
2 | import warnings
3 |
4 | import pytest
5 |
6 | from genalog.text import alignment
7 | from genalog.text.alignment import GAP_CHAR
8 | from tests.unit.cases.text_alignment import ALIGNMENT_REGRESSION_TEST_CASES
9 |
10 |
def random_utf8_char(byte_len=1):
    """Return a random character whose UTF-8 encoding is exactly `byte_len` bytes.

    UTF-8 code-point ranges per encoded length:
        1 byte : U+0000 .. U+007F
        2 bytes: U+0080 .. U+07FF
        3 bytes: U+0800 .. U+FFFF (excluding UTF-16 surrogates)
        4 bytes: U+10000 .. U+10FFFF

    Raises:
        ValueError: if `byte_len` is not in [1, 4].
    """
    # BUG FIX: the original range endpoints overlapped adjacent byte-lengths
    # (e.g. 0x7F is a 1-byte char but was includable for byte_len=2), so the
    # function could return a character of the wrong encoded length.
    if byte_len == 1:
        return chr(random.randint(0x0000, 0x007F))
    elif byte_len == 2:
        return chr(random.randint(0x0080, 0x07FF))
    elif byte_len == 3:
        # Surrogate code points (U+D800-U+DFFF) cannot be encoded as UTF-8;
        # resample until a Unicode scalar value is drawn.
        code_point = random.randint(0x0800, 0xFFFF)
        while 0xD800 <= code_point <= 0xDFFF:
            code_point = random.randint(0x0800, 0xFFFF)
        return chr(code_point)
    elif byte_len == 4:
        return chr(random.randint(0x10000, 0x10FFFF))
    else:
        raise ValueError(
            f"Invalid byte length: {byte_len}."
            + "utf-8 does not encode characters with more than 4 bytes in length"
        )
25 |
26 |
@pytest.mark.parametrize(
    "num_utf_char_to_test", [100]
)  # Number of char per byte length
@pytest.mark.parametrize(
    "byte_len", [1, 2, 3, 4]
)  # UTF does not encode with more than 4 bytes
@pytest.mark.parametrize(
    "gt_txt, noisy_txt, expected_aligned_gt, expected_aligned_noise",
    ALIGNMENT_REGRESSION_TEST_CASES,
)
def test_align(
    num_utf_char_to_test,
    byte_len,
    gt_txt,
    noisy_txt,
    expected_aligned_gt,
    expected_aligned_noise,
):
    # Regression check that alignment.align() still matches the expected
    # aligned strings when a character is substituted by a random UTF-8
    # character of each byte length. Mismatches only emit a RuntimeWarning;
    # this test never fails on misalignment.

    invalid_char = set(gt_txt).union(
        set(GAP_CHAR)
    )  # character to replace to cannot be in this set
    for _ in range(num_utf_char_to_test):
        utf_char = random_utf8_char(byte_len)
        while (
            utf_char in invalid_char
        ):  # find a utf char not in the input string and not GAP_CHAR
            utf_char = random_utf8_char(byte_len)
        char_to_replace = random.choice(list(invalid_char)) if gt_txt else ""

        # NOTE(review): str.replace() returns a new string; these two calls
        # discard the result, so align() below always runs on the ORIGINAL
        # inputs and the intended pre-alignment substitution never happens.
        # A naive fix interacts subtly with GAP_CHAR (which may itself be
        # chosen as char_to_replace and also marks gaps in the expected
        # strings) — confirm intent before changing.
        gt_txt.replace(char_to_replace, utf_char)
        noisy_txt.replace(char_to_replace, utf_char)
        expected_aligned_gt_sub = expected_aligned_gt.replace(char_to_replace, utf_char)
        expected_aligned_noise_sub = expected_aligned_noise.replace(
            char_to_replace, utf_char
        )

        # Run alignment
        aligned_gt, aligned_noise = alignment.align(gt_txt, noisy_txt)

        # Substitute in the alignment OUTPUT so it can be compared against
        # the substituted expected strings.
        aligned_gt = aligned_gt.replace(char_to_replace, utf_char)
        aligned_noise = aligned_noise.replace(char_to_replace, utf_char)
        if aligned_gt != expected_aligned_gt_sub:
            expected_alignment = alignment._format_alignment(
                expected_aligned_gt_sub, expected_aligned_noise_sub
            )
            result_alignment = alignment._format_alignment(aligned_gt, aligned_noise)
            warnings.warn(
                RuntimeWarning(
                    f"\n\n****Expect alignment returns:****\n{expected_alignment} \n****But got:****\n{result_alignment}"
                )
            )
79 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = flake8, py
3 |
4 |
5 | [testenv]
6 | passenv =
7 | # For e2e testing the OCR components
8 | BLOB_KEY
9 | BLOB_NAME
10 | COGNITIVE_SERVICE_KEY
11 | COMPUTER_VISION_SUBSCRIPTION_KEY
12 | SEARCH_SERVICE_KEY
13 | # Reading additional dependencies to run the test
14 | # https://tox.readthedocs.io/en/latest/example/basic.html#depending-on-requirements-txt-or-defining-constraints
15 | deps = -rrequirements-dev.txt
16 | commands =
17 | # {posargs} will be substituted by arguments after the `--` when running.
18 | # This will allow running subset of the test suite via tox.
19 | #
20 | # EX: tox -- -m "not azure and not slow"
21 | # will pass {-m "not azure and not slow"} to `pytest`
22 | # See https://tox.readthedocs.io/en/latest/example/general.html for more details
23 | pytest {posargs}
24 |
25 |
26 | [testenv:flake8]
27 | deps = flake8
28 | skip_install = True
29 | commands = flake8 .
30 |
31 |
32 | # Configurations for running pytest
33 | [pytest]
34 | log_cli = False
35 | log_format = %(asctime)s %(levelname)s %(message)s
36 | junit_family = xunit2
37 | # This enable custom marker as decorator "@pytest.mark.slow"
38 | markers =
39 | # These two markers allow to us to run faster subset of the test:
40 | # EX: pytest -m "not slow and not azure"
41 | # See https://docs.pytest.org/en/stable/example/markers.html#registering-markers
42 | slow: marks tests as slow (deselect with '-m "not slow"')
43 | azure: marks as integration tests that require azure resource
44 | io: marks integration tests involving some form of I/O operations (disk, internet, etc)
45 | testpaths =
46 | tests
47 | addopts =
48 | # reports all (except passed tests). See https://docs.pytest.org/en/latest/usage.html#detailed-summary-report
49 | -ra
50 | --cov-append --cov=genalog --cov-report=html --cov-report=term-missing --cov-report=xml --junitxml=junit/test-results.xml
51 |
52 |
53 | [flake8]
54 | # Configs for flake8-import-order, see https://pypi.org/project/flake8-import-order/ for more info.
55 | import-order-style=edited
56 | application-import-names=genalog, tests
57 | # Native flake8 configs
58 | max-line-length = 140
59 | exclude =
60 | build, dist, docs, example,
61 | .env*,.venv* # local virtual environments
62 | .tox
63 |
--------------------------------------------------------------------------------