├── .gitignore
├── resources
    ├── im_2.png
    ├── im_9.png
    ├── im_10.png
    ├── im_12.png
    └── DoGe_Scheme.png
├── scripts
    ├── print_profiling_stats.py
    ├── count_files.bash
    └── show_image_annotations.py
├── requirements.txt
├── docx_config.json
├── Dockerfile
├── src
    ├── url_parser.py
    ├── utils.py
    ├── manager.py
    ├── document_generator.py
    ├── docx_document.py
    └── augmentations.py
├── main.py
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | data
2 | *cache*
3 | nohup.out
4 | *.png
5 | .vscode
6 | *.profile
7 | logs
8 | fonts


--------------------------------------------------------------------------------
/resources/im_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Travvy88/DocumentGenerator_DoGe/HEAD/resources/im_2.png


--------------------------------------------------------------------------------
/resources/im_9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Travvy88/DocumentGenerator_DoGe/HEAD/resources/im_9.png


--------------------------------------------------------------------------------
/resources/im_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Travvy88/DocumentGenerator_DoGe/HEAD/resources/im_10.png


--------------------------------------------------------------------------------
/resources/im_12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Travvy88/DocumentGenerator_DoGe/HEAD/resources/im_12.png


--------------------------------------------------------------------------------
/resources/DoGe_Scheme.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Travvy88/DocumentGenerator_DoGe/HEAD/resources/DoGe_Scheme.png


--------------------------------------------------------------------------------
/scripts/print_profiling_stats.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import pstats
 3 | 
 4 | parser = argparse.ArgumentParser()
 5 | parser.add_argument('path')
 6 | parser.add_argument('-n')
 7 | args = parser.parse_args()
 8 | 
 9 | p = pstats.Stats(args.path)
10 | p.strip_dirs().sort_stats(pstats.SortKey.CUMULATIVE).print_stats(int(args.n))


--------------------------------------------------------------------------------
/scripts/count_files.bash:
--------------------------------------------------------------------------------
#!/bin/bash

# Print the number of regular files found recursively under a directory.
#   $1 - directory to scan
# Returns 1 (with a message on stderr) when the argument is missing or is not
# a directory.
count_files() {
  if [ -z "$1" ]; then
    echo "No argument provided" >&2
    return 1
  fi
  if [ ! -d "$1" ]; then
    echo "Argument is not a directory" >&2
    return 1
  fi
  # find prints one path per line; wc -l counts those lines.
  find "$1" -type f | wc -l
}

# Forward all script arguments to the function.
count_files "$@"


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | -e git+https://github.com/Travvy88/augraphy.git@fix_bboxes_oneof_augmentation_sequence#egg=augraphy
 2 | opencv-python==4.10.0.84
 3 | python-docx==1.1.2
 4 | matplotlib==3.8.2
 5 | pdf2image==1.17.0
 6 | tqdm==4.66.5
 7 | unoserver==2.1
 8 | unotools==0.3.3
 9 | beautifulsoup4==4.12.3
10 | requests==2.32.3
11 | pillow-simd==9.5.0.post2


--------------------------------------------------------------------------------
/docx_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "max_words": 20000,
 3 |     "p_2columns": 0.2,
 4 |     "font_size_interval": [8, 13],
 5 |     "p_line_spacing": [0.5, 0.5],
 6 |     "p_text_alignment": [0.1, 0.4, 0, 0.5],
 7 |     "p_heading_bold": 0.5,
 8 |     "heading_relative_size_interval": [1, 2],
 9 |     "p_heading_alignment": [0.5, 0.25, 0.01, 0.24],
10 |     "table_max_rows": 15,
11 |     "table_max_cols": 5
12 | }


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:22.04
 2 | 
 3 | # Install python and libreoffice
 4 | RUN apt-get update && apt-get install -y libreoffice python3 python3-pip git libjpeg-dev zlib1g-dev poppler-utils
 5 | 
 6 | # install unoserver to python that is used by LibreOffice
 7 | RUN /usr/bin/python3 -m pip install --user unoserver
 8 | 
 9 | # Set working directory to /app
10 | WORKDIR /app
11 | 
12 | # Copy requirements file
13 | COPY requirements.txt .
14 | 
15 | # Install Python dependencies
16 | RUN pip3 install -r requirements.txt
17 | 
18 | # Copy the current directory contents into the container at /app
19 | COPY src /app/src
20 | COPY main.py /app/main.py
21 | COPY docx_config.json /app/docx_config.json
22 | 
23 | # run interactively
24 | CMD ["/bin/bash"]


--------------------------------------------------------------------------------
/scripts/show_image_annotations.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import json
 3 | from PIL import Image, ImageDraw, ImageFont
 4 | 
 5 | parser = argparse.ArgumentParser()
 6 | parser.add_argument('path')
 7 | 
 8 | args = parser.parse_args()
 9 | 
10 | image = Image.open(args.path)
11 | height, width = image.size
12 | 
13 | with open(args.path + '.json', 'r') as f:
14 |     annotations = json.load(f)
15 | 
16 | draw = ImageDraw.Draw(image)
17 | font = ImageFont.truetype('arial.ttf', size=8)
18 | for word, bbox in zip(annotations['words'], annotations['bboxes']):
19 |     x, y, w, h = bbox  
20 |     
21 | 
22 |     
23 | 
24 |     if w < 0:
25 |         x = x - w
26 |         w = w * -1
27 |     if h < 0:
28 |         y = y - h
29 |         h = h * -1
30 | 
31 |     x1 = int(x * width)
32 |     y1 = int(y * height)
33 |     x2 = int((x + w) * width)
34 |     y2 = int((y + h) * height)
35 | 
36 |     print([x, y, w, h])
37 |     print([x1, y1, x2, y2])
38 | 
39 |     draw.rectangle([x1, y1, x2, y2], outline="blue", width=1)
40 |     draw.text((x1, y1 - 10), word, fill="red", font=font, )  # Adjust position as needed
41 | 
42 | output_image_path = 'show_anno.png'
43 | image.save(output_image_path)
44 | 


--------------------------------------------------------------------------------
/src/url_parser.py:
--------------------------------------------------------------------------------
 1 | from urllib.parse import urljoin, urlparse
 2 | from bs4 import BeautifulSoup
 3 | import requests
 4 | from tqdm import tqdm
 5 | 
 6 | 
 7 | class UrlParser:
 8 |     def parse(self, start_url, max_urls, languages):
 9 |         ptr = 0
10 |         urls = []
11 |         urls.append(start_url)
12 | 
13 |         pbar = tqdm(initial=1, total=max_urls)
14 |         while len(urls) < max_urls:
15 |             url = urls[ptr]
16 |             try:
17 |                 response = requests.get(url)
18 |                 response.raise_for_status()
19 |             except requests.exceptions.RequestException as e:
20 |                 print(f"Failed to retrieve {url}: {e}")
21 |                 return
22 | 
23 |             # Parse the page content
24 |             soup = BeautifulSoup(response.content, 'html.parser')
25 | 
26 |             # Find all links on the page
27 |             links = soup.find_all('a', href=True)
28 |             for link in links:
29 |                 href = link['href']
30 |                 full_url = urljoin(url, href)
31 |                 if self.is_valid_url(full_url, languages) and full_url not in urls and len(urls) < max_urls:
32 |                     urls.append(full_url)
33 |                     pbar.update(1)
34 |             ptr += 1
35 |         return urls[1:]
36 |     
37 |     def is_valid_url(self, url, languages):
38 |         # Check if the URL is a valid Wikipedia article URL
39 |         parsed = urlparse(url)
40 |         if parsed.scheme in ('http', 'https') and 'wikipedia.org' in parsed.netloc and \
41 |             any(parsed.netloc.find(lang_element) != -1 for lang_element in languages):
42 |             path = parsed.path
43 |             if path.startswith('/wiki/') and not any(sub in path for sub in [':', '/wiki/Main_Page']):
44 |                 return True
45 |         return False


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import cProfile
 3 | import json
 4 | from pathlib import Path
 5 | from src.manager import Manager
 6 | 
 7 | 
 8 | def create_parser():
 9 |     parser = argparse.ArgumentParser(description="Manager Configuration")
10 |     
11 |     parser.add_argument('--out_dir', type=str, required=True,
12 |                         help='Output directory for saving results')
13 |     parser.add_argument('--remove_existing_dir', action='store_true',
14 |                         help='If out_dir exists, delete the folder and files before creating a new one')
15 |     parser.add_argument('--debug', action='store_true',
16 |                         help='Enable debug mode')
17 |     parser.add_argument('--image_size', type=int, default=244,
18 |                         help='Size of the final images (default: 244)')
19 |     parser.add_argument('--start_page', type=str, default='https://en.wikipedia.org/wiki/Main_Page',
20 |                         help='Starting page URL (default: Wikipedia main page)')
21 |     parser.add_argument('--languages', type=str, nargs='+', default=['en'],
22 |                         help='Permitted languages. Other languages will be ignored (default: English)')
23 |     parser.add_argument('--max_urls', type=int, default=16,
24 |                         help='Maximum number of URLs to process (default: 100)')
25 |     parser.add_argument('--num_processes', type=int, default=1,
26 |                         help='Number of processes to use (default: 1)')
27 |     parser.add_argument('--max_threads', type=int, default=3,
28 |                         help='Maximum threads inside a process (default: 3)')
29 |     parser.add_argument('--ports', type=str, nargs='+', default=[8145, 8146],
30 |                         help='List of ports to use (default: [8145, 8146]). Number of ports \
31 |                             should be 2 times larger than num_processes')
32 | 
33 |     return parser
34 | 
if __name__ == "__main__":
    # Entry point: parse the CLI, load the docx settings and run the generation.
    parser = create_parser()
    args = parser.parse_args()

    # Manager hands ports[i] and ports[num_processes + i] to each process, so two
    # ports per process are required. Fail here with a usage error instead of an
    # IndexError after some unoserver instances have already been started.
    required_ports = 2 * args.num_processes
    if len(args.ports) < required_ports:
        parser.error(f"--ports needs at least {required_ports} values for "
                     f"--num_processes {args.num_processes}, got {len(args.ports)}")

    with open('docx_config.json', 'r') as f:
        docx_config = json.load(f)

    manager = Manager(
        docx_config=docx_config,
        out_dir=Path(args.out_dir),
        remove_existing_dir=args.remove_existing_dir,
        debug=args.debug,
        image_size=args.image_size,
        start_page=args.start_page,
        languages=tuple(args.languages),
        max_urls=args.max_urls,
        num_processes=args.num_processes,
        max_threads=args.max_threads,
        ports=tuple(args.ports)
    )
    manager.generate()
56 | 


--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
 1 | from PIL import Image, ImageDraw
 2 | import cv2
 3 | import numpy as np
 4 | 
 5 | 
 6 | def convert_xywh_to_x1y1x2y2(bboxes):
 7 |     if isinstance(bboxes, list):
 8 |         return [[bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]] for bbox in bboxes]
 9 |     if isinstance(bboxes, np.ndarray):
10 |         x1 = bboxes[:, 0]
11 |         x2 = bboxes[:, 1]
12 |         x3 = bboxes[:, 0] + bboxes[:, 2]
13 |         x4 = bboxes[:, 1] + bboxes[:, 3]
14 |         return np.column_stack((x1, x2, x3, x4))
15 | 
16 | 
def convert_x1y1x2y2_to_xywh(bboxes):
    """Turn corner-form boxes (x1, y1, x2, y2) into (x, y, w, h).

    A list of indexable boxes yields a list of 4-element lists; a 2-D numpy
    array yields a 2-D numpy array. Any other input type yields None.
    """
    if isinstance(bboxes, list):
        converted = []
        for box in bboxes:
            left, top = box[0], box[1]
            converted.append([left, top, box[2] - left, box[3] - top])
        return converted
    if isinstance(bboxes, np.ndarray):
        left = bboxes[:, 0]
        top = bboxes[:, 1]
        box_width = bboxes[:, 2] - left
        box_height = bboxes[:, 3] - top
        return np.column_stack((left, top, box_width, box_height))
26 | 
27 | 
def normalize_bboxes(bboxes, width, height):
    """Divide box coordinates by the image size.

    Elements 0 and 2 of every box are divided by ``width``, elements 1 and 3 by
    ``height``. Lists give a list of lists, 2-D numpy arrays give a 2-D numpy
    array, and any other input type gives None.
    """
    divisors = (width, height, width, height)
    if isinstance(bboxes, list):
        scaled = []
        for box in bboxes:
            scaled.append([box[k] / divisors[k] for k in range(4)])
        return scaled
    if isinstance(bboxes, np.ndarray):
        # Column-by-scalar division, one column at a time, then stacked back.
        columns = [bboxes[:, k] / divisors[k] for k in range(4)]
        return np.column_stack(columns)
37 | 
38 | 
def unnormalize_bboxes(bboxes, width, height):
    """Multiply box coordinates by the image size.

    Elements 0 and 2 of every box are multiplied by ``width``, elements 1 and 3
    by ``height``. Lists give a list of lists, 2-D numpy arrays give a 2-D numpy
    array, and any other input type gives None.
    """
    factors = (width, height, width, height)
    if isinstance(bboxes, list):
        scaled = []
        for box in bboxes:
            scaled.append([box[k] * factors[k] for k in range(4)])
        return scaled
    if isinstance(bboxes, np.ndarray):
        # Column-by-scalar multiplication, one column at a time, then stacked back.
        columns = [bboxes[:, k] * factors[k] for k in range(4)]
        return np.column_stack(columns)
48 | 
def draw_bboxes_pil(image, bboxes, words=None):
    """Draw red rectangles, optionally labelled, onto a PIL image in place.

    Args:
        image: PIL image to draw on; it is modified and returned.
        bboxes: boxes in (x1, y1, x2, y2) pixel coordinates.
        words: one label per box, or None to draw the boxes without text.

    Returns:
        The same image object that was passed in.
    """
    if words is None:
        # The signature advertises words as optional, but zip(bboxes, None)
        # raised TypeError; use empty labels like draw_bboxes does.
        words = [""] * len(bboxes)

    draw = ImageDraw.Draw(image)
    for bbox, word in zip(bboxes, words):
        x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
        # Draw rectangle with red color and 2px thickness
        draw.rectangle([(x1, y1), (x2, y2)], outline="red", width=2)
        # Put the label just above the box
        draw.text((x1, y1-15), word, fill="red")
    return image
58 | 
def draw_bboxes(image, bboxes, words=None):
    """Draw labelled boxes on a numpy or PIL image and return the same kind of image.

    ``bboxes`` are in (x1, y1, x2, y2) format. When ``words`` is None every box
    gets an empty label. Raises ValueError for any other image type.
    """
    labels = [""] * len(bboxes) if words is None else words

    if isinstance(image, np.ndarray):
        # Round-trip through PIL so the drawing code exists only once.
        pil_image = Image.fromarray(image)
        pil_image = draw_bboxes_pil(pil_image, bboxes, labels)
        return np.array(pil_image)

    if isinstance(image, Image.Image):
        return draw_bboxes_pil(image, bboxes, labels)

    raise ValueError(f"Unsupported image type: {type(image)}")
74 | 


--------------------------------------------------------------------------------
/src/manager.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import multiprocessing
  3 | import os
  4 | from pathlib import Path
  5 | import shutil
  6 | import time
  7 | 
  8 | from tqdm import tqdm
  9 | 
 10 | from src.document_generator import DocumentGenerator
 11 | from src.url_parser import UrlParser
 12 | 
 13 | 
 14 | class Manager:
 15 |     def __init__(self, 
 16 |                  docx_config: dict,
 17 |                  out_dir: Path, 
 18 |                  remove_existing_dir,
 19 |                  debug,
 20 |                  image_size, 
 21 |                  start_page,
 22 |                  languages, 
 23 |                  max_urls,
 24 |                  num_processes, 
 25 |                  max_threads,
 26 |                  ports):
 27 |         
 28 |         self.docx_config = docx_config
 29 |         self.out_dir = out_dir
 30 |         self.debug = debug
 31 |         self.image_size = image_size
 32 |         self.start_page = start_page
 33 |         self.languages = languages
 34 |         self.max_urls = max_urls
 35 | 
 36 |         self.num_processes = num_processes
 37 |         self.max_threads = max_threads
 38 |         self.ports = ports
 39 | 
 40 |         self.url_parser = UrlParser()
 41 |         self.folders = self._create_folders(remove_existing_dir=remove_existing_dir)
 42 |         self.doc_generators = [DocumentGenerator(self.max_threads,
 43 |                                                  self.image_size, 
 44 |                                                  self.docx_config, 
 45 |                                                  self.folders[i], 
 46 |                                                  ports[i], 
 47 |                                                  ports[num_processes + i],
 48 |                                                  self.debug,) \
 49 |                                for i in range(num_processes)]
 50 | 
 51 |     def generate(self):
 52 |         start_time = time.time()
 53 |         print('Parsing urls...')
 54 |         urls = self.url_parser.parse(self.start_page, self.max_urls, self.languages)
 55 |         urls_chunks = self._split_urls_to_chunks(urls)
 56 |         processes = []
 57 |         
 58 |         for i in range(self.num_processes):
 59 |             process = multiprocessing.Process(name=f"Generator_{i}", target=self.doc_generators[i].generate, 
 60 |                                               kwargs={"urls": urls_chunks[i]})
 61 |             processes.append(process)
 62 |             process.start()
 63 | 
 64 |         for process in processes:
 65 |             process.join()
 66 | 
 67 |         self._merge_all_folders()
 68 |         
 69 |         end_time = time.time()
 70 |         file_count = 0
 71 |         for root, dirs, files in os.walk(self.out_dir):
 72 |             file_count += len(files)
 73 |         file_count /= 2
 74 |         print('Images:', int(file_count))
 75 |         print('Elapsed time:', end_time - start_time)
 76 |         print('Urls per second:', self.max_urls / (end_time - start_time))
 77 |         print('Images per second:', file_count / (end_time - start_time))
 78 |         print()
 79 |         print('Seconds per url:', (end_time - start_time) / self.max_urls)
 80 |         print('Seconds per image:', (end_time - start_time) / file_count)
 81 |         print('Images per url:', file_count / self.max_urls)
 82 |     
 83 |     def _split_urls_to_chunks(self, urls):
 84 |         n = len(urls)
 85 |         chunk_size = n // self.num_processes
 86 |         remainder = n % self.num_processes
 87 | 
 88 |         chunks = []
 89 |         for i in range(self.num_processes):
 90 |             start_index = i * chunk_size + min(i, remainder)
 91 |             end_index = start_index + chunk_size + (1 if i < remainder else 0)
 92 |             chunks.append(urls[start_index:end_index])
 93 |         return chunks
 94 |     
 95 |     def _create_folders(self, remove_existing_dir):
 96 |         folders = [self.out_dir / f"tmp_process_{i}" for i in range(self.num_processes)]
 97 |         if remove_existing_dir:
 98 |             if os.path.exists(self.out_dir):
 99 |                 shutil.rmtree(self.out_dir)
100 |             for folder in folders:
101 |                 if os.path.exists(folder):
102 |                     shutil.rmtree(folder)
103 |         
104 |         for folder in folders:
105 |             os.makedirs(folder)
106 | 
107 |         return folders
108 |     
109 |     def _validate_annotations(self, image_path, anno_path):
110 |         if not os.path.exists(image_path):
111 |             print(f"Image {image_path} not found")
112 |             return False
113 |         if not os.path.exists(anno_path):
114 |             print(f"Annotation {anno_path} not found")
115 |             return False
116 |         
117 |         with open(anno_path, 'r') as f:
118 |             anno = json.load(f)
119 | 
120 |         if len(anno['words']) != len(anno['bboxes']):
121 |             print(f"Annotation {anno_path} has different number of words and bboxes")
122 |             return False
123 | 
124 |         return True
125 | 
126 |     def _merge_all_folders(self):
127 |         counter = 0
128 |         bad_annotations = 0
129 |         for folder_path in tqdm(self.folders):
130 |             if os.path.isdir(folder_path):
131 |                 # Iterate over each file in the current folder
132 |                 for file_name in sorted([f for f in os.listdir(folder_path) if f.endswith('.png.json')]):
133 |                     json_path = os.path.join(folder_path, file_name)
134 | 
135 |                     if self._validate_annotations(json_path[:-5], json_path):
136 |                         new_file_name = f"image_{counter}.png"   
137 |                         new_json_name = f"image_{counter}.png.json"
138 | 
139 |                         new_file_path = os.path.join(self.out_dir, new_file_name)
140 |                         new_json_path = os.path.join(self.out_dir, new_json_name)
141 | 
142 |                         # Move and rename the file
143 |                         shutil.move(json_path[:-5], new_file_path)
144 |                         shutil.move(json_path, new_json_path)
145 |                         #print(f'{json_path} -> {new_json_path}')
146 |                         #print(f'{json_path[:-5]} -> {new_file_path}')
147 |                         
148 |                         # Move the colored image
149 |                         _, number = file_name.split("_")
150 |                         number = number.split(".")[0]
151 |                         #print(f'{folder_path}/im_{number}_colored.png -> {new_file_path[:-4] + "_colored.png"}')
152 |                         #print('--------------------------------')
153 |                         if os.path.exists(f"{folder_path}/im_{number}_colored.png"):
154 |                             shutil.move(f"{folder_path}/im_{number}_colored.png", new_file_path[:-4] + "_colored.png")
155 | 
156 |                         counter += 1
157 |                     else:
158 |                         bad_annotations += 1
159 |         
160 |         for i in range(self.num_processes):
161 |             shutil.rmtree(self.out_dir / f'tmp_process_{i}')
162 | 
163 |         print(f'Folder merge finished, bad annotations: {bad_annotations}')
164 | 


--------------------------------------------------------------------------------
/src/document_generator.py:
--------------------------------------------------------------------------------
  1 | import cProfile
  2 | from concurrent.futures import ThreadPoolExecutor
  3 | import json
  4 | import multiprocessing
  5 | import os
  6 | from pathlib import Path
  7 | import subprocess
  8 | from time import sleep
  9 | import time
 10 | import traceback
 11 | from bs4 import BeautifulSoup
 12 | import numpy as np
 13 | from augraphy import AugraphyPipeline
 14 | from unoserver import client
 15 | import requests
 16 | from tqdm import tqdm
 17 | from PIL import Image, ImageDraw
 18 | import cv2 
 19 | import threading
 20 | 
 21 | import src.utils as utils
 22 | from src.augmentations import get_augmentation_phases
 23 | from src.docx_document import DocxDocument
 24 | 
 25 | 
 26 | def profileit(func):
 27 |     def wrapper(*args, **kwargs):
 28 |         datafn = func.__name__ + ".profile" # Name the data file sensibly
 29 |         prof = cProfile.Profile()
 30 |         retval = prof.runcall(func, *args, **kwargs)
 31 |         prof.dump_stats(datafn)
 32 |         return retval
 33 | 
 34 |     return wrapper
 35 | 
 36 | class DocumentGenerator:
 37 |     def __init__(self, max_threads, image_size, docx_config, out_folder, port, uno_port, debug_mode):
 38 |         self.max_threads = max_threads
 39 |         self.image_size = image_size
 40 |         self.out_folder = out_folder
 41 |         self.docx_config = docx_config
 42 |         self.port = port
 43 |         self.uno_port = uno_port
 44 |         self.debug_mode = debug_mode
 45 |         
 46 |         self.image_counter = 0
 47 |     
 48 |         command = f"/usr/bin/python3 -m unoserver.server --port {port} --uno-port {uno_port} > /dev/null 2>&1"
 49 |         print('START SERVER', port, uno_port)
 50 |         self.unoserver_process = subprocess.Popen(command, shell=True)
 51 |         self.uno_client = client.UnoClient(port=port)
 52 |     
 53 |     def __del__(self):
 54 |         self.unoserver_process.kill()
 55 | 
 56 |     def generate(self, urls):
 57 |         print('Start Document Generator...')
 58 |         with ThreadPoolExecutor(max_workers=self.max_threads, 
 59 |                                 thread_name_prefix=f"{multiprocessing.current_process().name}_thread") as executor:
 60 |             futures = [executor.submit(self.create_doc_try_except, url) for url in urls]
 61 |             for future in futures:
 62 |                 future.result()
 63 |     
 64 |     def create_doc_try_except(self, url):
 65 |         try:
 66 |             self.create_doc(url)
 67 |             print(f'{threading.current_thread().name} total images generated by the current process: {self.image_counter}')
 68 |         except Exception as e:
 69 |             if self.debug_mode:
 70 |                 print(traceback.format_exc())
 71 |             else:
 72 |                 print("skipping due augmentation error")
 73 | 
 74 |     #@profileit
 75 |     def create_doc(self, url):
 76 |         doc = DocxDocument(self.docx_config, self.uno_client)
 77 |         response = requests.get(url)
 78 |         if response.status_code != 200:
 79 |             print(f"Bad Response: {response}")
 80 |             return
 81 |         
 82 |         # create colored docx document
 83 |         soup = BeautifulSoup(response.text, 'html.parser')
 84 |         for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', "table"]):
 85 |             if element.name.startswith('h'):
 86 |                 doc.add_heading(element)
 87 |             elif element.name == "table":
 88 |                 doc.add_table(element)
 89 |             else:
 90 |                 doc.add_text(element)
 91 |                 
 92 |             if doc.get_num_words() > self.docx_config["max_words"]:
 93 |                 break
 94 |             
 95 |         # extract annotations from colored images
 96 |         colored_images = doc.get_images(dpi=200, image_size=1500)
 97 |         annotations = self.get_bboxes(colored_images, doc.color2word)  # bboxes are normalized to [0,1]
 98 |         doc.convert_to_uncolored_docx()
 99 |         images = doc.get_images(dpi=200, image_size=1024)  # get images for augmentation stage        
100 |         for i, image in enumerate(images):
101 |             if len(annotations[i]['words']) != len(annotations[i]['bboxes']):
102 |                 continue
103 |             # unnormalize bboxes to augmentation image size
104 |             bounding_boxes = np.array(annotations[i]["bboxes"])
105 |             bounding_boxes = utils.unnormalize_bboxes(bounding_boxes, colored_images[0].size[0], colored_images[0].size[1])
106 | 
107 |             # perform augmentation
108 |             augmentation_pipeline = AugraphyPipeline(bounding_boxes=bounding_boxes,
109 |                                                      log=False, **get_augmentation_phases())
110 |             
111 |             augmented_cv2, _, _, augmented_bounding_boxes = augmentation_pipeline(np.array(image))
112 |             augmented_image = Image.fromarray(augmented_cv2)
113 |             with threading.Lock():
114 |                 if self.debug_mode:
115 |                     bboxes_for_image = utils.normalize_bboxes(augmented_bounding_boxes, colored_images[0].size[0], colored_images[0].size[1])
116 |                     bboxes_for_image = utils.unnormalize_bboxes(bboxes_for_image, augmented_image.size[0], augmented_image.size[1])
117 | 
118 |                     augmented_image = utils.draw_bboxes_pil(augmented_image, bboxes_for_image, annotations[i]["words"])
119 |                     colored_image = utils.draw_bboxes_pil(colored_images[i], bounding_boxes, annotations[i]["words"])
120 |                     colored_image.save(self.out_folder / f"im_{self.image_counter}_colored.png")
121 | 
122 |                 # resize image to final dataset size and save 
123 |                 augmented_image = augmented_image.resize((self.image_size, self.image_size))
124 |                 augmented_image.save(self.out_folder / f"im_{self.image_counter}.png")
125 |                 
126 |                 # convert booxes to (x, y, w, h) format and normalize to [0,1]
127 |                 augmented_bounding_boxes = np.array(augmented_bounding_boxes).astype(int)
128 |                 annotations[i]["bboxes"] = utils.normalize_bboxes(augmented_bounding_boxes, colored_images[0].size[0], colored_images[0].size[1]).tolist()
129 |                 
130 |                 # save annotation
131 |                 with open(self.out_folder/ f"im_{self.image_counter}.png.json", "w") as f:
132 |                     json.dump(annotations[i], f)
133 |                 self.image_counter += 1       
134 |   
135 |     def get_bboxes(self, images, color2word):
136 |         annotations = []
137 |         for image_pil in images:
138 |             width, height = image_pil.size
139 |             image_annotations = {"words": [], "bboxes": []}
140 |             image = np.asarray(image_pil)
141 | 
142 |             thr = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
143 |             thr = cv2.threshold(thr, 254, 255, cv2.THRESH_BINARY_INV)[1]
144 |             cnts = cv2.findContours(thr, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
145 | 
146 |             for c in cnts:
147 |                 peri = cv2.arcLength(c, True)
148 |                 approx = cv2.approxPolyDP(c, 0.015 * peri, True)
149 | 
150 |                 if len(approx) == 4:
151 |                     x, y, w, h = cv2.boundingRect(approx)
152 |                     rgb_color = image_pil.getpixel((x+1, y+1))
153 |                     color = '#%02x%02x%02x' % (rgb_color)
154 |                     if color in color2word:
155 |                         word = color2word[color]
156 |                         image_annotations['words'].append(word)
157 |                         image_annotations["bboxes"].append(
158 |                             (
159 |                                 x / width, 
160 |                                 y / height, 
161 |                                 (x + w) / width, 
162 |                                 (y + h) / height)
163 |                              )
164 |             annotations.append(image_annotations)
165 |         return annotations
166 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # DoGe — Synthetic DOcument GEnerator for Document AI
  2 | 
  3 | DoGe is designed to synthesize a dataset of realistic document scans. Each document contains meaningful text, headings, 
  4 | tables, paragraphs with different formatting and fonts which are parsed from Wikipedia. The coordinates 
  5 | of the words are extracted using the No-OCR method we invented for faster generation on CPU.
  6 | 
  7 | ## Document examples
  8 | 
  9 | <div style="display: flex; flex-wrap: wrap;">
 10 |     <img src="resources/im_2.png" width="300" style="margin-right: 10px; margin-bottom: 10px;">
 11 |     <img src="resources/im_9.png" width="300" style="margin-right: 10px; margin-bottom: 10px;">
 12 |     <img src="resources/im_10.png" width="300" style="margin-right: 10px; margin-bottom: 10px;">
 13 |     <img src="resources/im_12.png" width="300" style="margin-right: 10px; margin-bottom: 10px;">
 14 | </div>
 15 | 
 16 | See the full-size (1024x1024) images in the [resources](./resources) folder.
 17 | 
 18 | ## Usage
 19 | 
 20 | ### Docker installation
 21 | 
 22 | You can use Docker image with predefined environment to run DoGe:
 23 | ```bash
 24 | git clone https://github.com/Travvy88/DocumentGenerator_DoGe
 25 | cd DocumentGenerator_DoGe
 26 | docker build -t doge .
 27 | ```
 28 | 
 29 | When you run the container, replace `/full/path/to/output/folder/on/your/computer` in the `docker run` command with an output folder on the host. Inside the docker container you can
 30 | [start document generation](#start-data-generation). 
 31 | 
 32 | ### Ubuntu installation
 33 | 
 34 | For faster generation, it is recommended to install all dependencies without Docker. 
 35 | DoGe is tested on Ubuntu 22.04.
 36 | ```bash
 37 | sudo apt-get update 
 38 | sudo apt-get install libreoffice libjpeg-dev zlib1g-dev poppler-utils
 39 | /usr/bin/python3 -m pip install --user unoserver  # install unoserver on system python
 40 | 
 41 | git clone https://github.com/Travvy88/DocumentGenerator_DoGe
 42 | cd DocumentGenerator_DoGe
 43 | # there you can make venv if needed!
 44 | pip3 install -r requirements.txt 
 45 | ```
 46 | 
 47 | ## Start Data Generation
 48 | 
 49 | Docker:
 50 | 
 51 | ```bash
 52 | docker run -v /full/path/to/output/folder/on/your/computer:/app/data doge python3 main.py --out_dir data --image_size 244 --max_urls 4 --num_processes 2 --ports 4000 4001 4002 4003
 53 | ``` 
 54 | 
 55 | Ubuntu:
 56 | 
 57 | ```bash
 58 | python3 main.py --out_dir data --image_size 244 --max_urls 4 --num_processes 2 --ports 4000 4001 4002 4003
 59 | ```
 60 | 
 61 | ### Main.py
 62 | 
 63 | The following arguments can be passed to the script:
 64 | 
 65 | - `--out_dir`: The output directory for saving results. This argument is required.
 66 | - `--remove_existing_dir`: If set, the output directory will be deleted before creating a new one.
 67 | - `--image_size`: The size of the final images. Default is `244`.
 68 | - `--start_page`: The starting page URL. Default is the Wikipedia main English page. You can use another language Wiki main page URL.
 69 | - `--languages`: Permitted languages. Pages with other localizations will be ignored. Default is `['en']`.
 70 | - `--max_urls`: The maximum number of URLs to process. Default is `100`.
 71 | - `--num_processes`: The number of processes to use. Default is `1`. Each process will start DocumentGenerator and start Unoserver for each generator.
 72 | - `--max_threads`: The maximum threads inside a process. Default is `3`.
 73 | - `--ports`: The list of ports to use. Default is `[8145, 8146]`. The number of ports should be 2 times larger than `num_processes` (each Unoserver instance needs 2 ports for proper multicore work)
 74 | - `--debug`: If set, draws bounding boxes + words on each image and saves intermediate images with highlighted words.
 75 | 
 76 | 
 77 | ### Docx_config.json
 78 | 
 79 | | Parameter | Description |
 80 | |-------------|-------------|
 81 | | `max_words` | The maximum number of words allowed in the generated documents. |
 82 | | `p_2columns` | The probability that the document will be formatted into two columns. |
 83 | | `font_size_interval` | The font size range from which the size is randomly selected for each document. |
 84 | | `p_line_spacing` | A list of probabilities controlling the line spacing of the document (1.5 or double). |
 85 | | `p_text_alignment` | A list of probabilities controlling the text alignment of the document (center, left, right, justify). |
 86 | | `p_heading_bold` | The probability that headings will be displayed in bold font. |
 87 | | `heading_relative_size_interval` | The range of relative font sizes for headings. The relative font size is chosen randomly. |
 88 | | `p_heading_alignment` | A list of probabilities controlling the alignment of headings (center, left, right, justify). |
 89 | | `table_max_rows` | The maximum number of rows allowed in a table. Tables with more than the specified number of rows are dropped. |
 90 | | `table_max_cols` | The maximum number of columns allowed in a table. Tables with more than the specified number of columns are dropped. |
 91 | 
 92 | For parameters given as probabilities or intervals, the actual value is sampled randomly for each document.
 93 | 
 94 | In my experience, the generator produces about 14 images per URL on average
 95 | with the above Docx settings. 
 96 | 
 97 | ### Augmentations 
 98 | 
 99 | The augmentation pipeline is applied at the final stage. You can manage different augmentations 
100 | in `src/augmentations.py` file. Read the [Augraphy Docs](https://augraphy.readthedocs.io/en/latest/) for detailed explanation. 
101 | 
102 | 
103 | ## How it works
104 | ![General scheme](resources/DoGe_Scheme.png "General scheme of DoGe")
105 | 
106 | Firstly, the `Manager` class creates the `DocumentGenerator` instances in separate processes. For 
107 | each `DocumentGenerator`, a Unoserver instance is started.
108 | 
109 | Then, the `UrlParser` generates a list of URLs by crawling the web, starting from a given start page 
110 | and following links on each page. It uses `BeautifulSoup` to parse HTML content and extract links, 
111 | then checks each link's validity and language, adding it to the list if it meets certain conditions. 
112 | The process continues until a maximum number of URLs is reached, and the method returns the list of 
113 | generated URLs, excluding the starting URL. 
114 | 
115 | When data generation begins, the list of URLs is divided into several chunks for each `DocumentGenerator`.
116 | Each `DocumentGenerator` instance retrieves a Wikipedia HTML page by URL from its chunk.
117 | Headers, paragraphs formatting, and tables are extracted and placed into a Docx document via the `DocxDocument` class. 
118 | At this stage, some random parametrization is applied according to `docx_config.json`. 
119 | For example, font size, text alignment, one or two columns, and other parameters 
120 | are chosen for each document randomly. 
121 | 
122 | After that, each word in the Docx is filled with a unique color. As a result, a colored rectangle
123 | appears in place of each word. The image will be encoded with 24-bit color depth, 
124 | so the maximum number of words per document is 16,777,216. The text of each word is saved to a hashmap of type color_code -> word. 
125 | 
126 | The next step is Docx to image conversion. DoGe uses Unoserver to convert Docx to Pdf and
127 | pdf2image for image rendering.
128 | 
129 | Then, all rectangle coordinates are detected via OpenCV on converted images. The word for each bounding box is retrieved from the hashmap. 
130 | DoGe saves annotations to JSON files in the following format:
131 | 
132 | ```json
133 | {
134 |   "words": [
135 |     "Hello", 
136 |     "World"
137 |   ],
138 |   "bboxes": [
139 |     [0.1, 0.1, 0.13, 0.12],
140 |     [0.4, 0.3, 0.51, 0.32]
141 |   ]
142 | }
143 | ```
144 | 
145 | The bboxes are normalized by the image width and height and saved in XYXY format (`x_min, y_min, x_max, y_max`). 
146 | 
147 | The final step is deleting all color fills from words in the Docx document, rendering images, applying Augraphy augmentations, 
148 | and saving the augmented images to disk. That's it!
149 | 
150 | ## Join us!
151 | DoGe is a promising method of producing synthetic document datasets. Here are some features that would help many developers:
152 | - Download and place images into documents
153 | - Add annotations of headers, tables, paragraphs and images (if added)
154 | - Add different output formats (Parquet for example)
155 | - Add additional information via LLMs
156 | - Performance improvement: the **bottleneck** of generation is transforming Docx -> Pdf -> Png! I am looking for a simpler way of converting Docx to Png.  
157 | 
158 | If you have any ideas or you want to take part in the development of DoGe, write to me:
159 | - travvy88@yandex.ru
160 | - https://t.me/travvy88
161 | 
162 | Or create a Pull Request to this repo. I will be glad to improve the project with the power of community.
163 | 
164 | ## Acknowledgments
165 | Here are some great open-source projects I benefit from:
166 | - [ISP RAS Dedoc Team](https://github.com/ispras/dedoc) for support and assistance. 
167 | - [Augraphy](https://github.com/sparkfish/augraphy) for augmentation code of final images. 
168 | - [Unoserver](https://github.com/unoconv/unoserver) for Docx to Pdf converter.
169 | - [Pdf2image](https://github.com/Belval/pdf2image) for image from Pdf rendering module.
170 | - [Pillow-SIMD](https://github.com/uploadcare/pillow-simd) for faster image processing. 
171 | 


--------------------------------------------------------------------------------
/src/docx_document.py:
--------------------------------------------------------------------------------
  1 | import cProfile
  2 | import io
  3 | import re
  4 | from docx import Document
  5 | from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING
  6 | from docx.oxml.ns import qn
  7 | from docx.shared import Pt, RGBColor
  8 | from docx.oxml import OxmlElement
  9 | import matplotlib.font_manager
 10 | import numba
 11 | import numpy as np
 12 | from pdf2image import convert_from_bytes
 13 | from PIL import Image
 14 | def profileit(func):
 15 |     def wrapper(*args, **kwargs):
 16 |         datafn = func.__name__ + ".profile" # Name the data file sensibly
 17 |         prof = cProfile.Profile()
 18 |         retval = prof.runcall(func, *args, **kwargs)
 19 |         prof.dump_stats(datafn)
 20 |         return retval
 21 | 
 22 |     return wrapper
 23 | 
 24 | class DocxDocument:
 25 |     def __init__(self, docx_config, uno_client):
 26 |         self.docx_config = docx_config
 27 |         self.uno_client = uno_client
 28 | 
 29 |         self.doc = Document()
 30 |         self.colors = self._init_colors(docx_config["max_words"])
 31 | 
 32 |         self.color2word = {}
 33 |         self.color_ptr = 0
 34 | 
 35 |         # sample random settings from docx_config 
 36 |         if np.random.binomial(1, self.docx_config["p_2columns"]):
 37 |             self.num_columns = 2
 38 |         else:
 39 |             self.num_columns = 1
 40 |         self.configure_several_columns()
 41 | 
 42 |         self.font_size = Pt(np.random.randint(*self.docx_config["font_size_interval"]))
 43 |         self.font_name = np.random.choice(self._list_available_fonts())
 44 |         
 45 |         self.line_spacing = np.random.choice(
 46 |             (WD_LINE_SPACING.ONE_POINT_FIVE, WD_LINE_SPACING.DOUBLE), 
 47 |             p=self._normalize_probabilities(self.docx_config["p_line_spacing"])) 
 48 | 
 49 |         self.paragraph_alignment = np.random.choice(
 50 |             (WD_PARAGRAPH_ALIGNMENT.CENTER, WD_PARAGRAPH_ALIGNMENT.LEFT, 
 51 |              WD_PARAGRAPH_ALIGNMENT.RIGHT, WD_PARAGRAPH_ALIGNMENT.JUSTIFY), 
 52 |             p=self._normalize_probabilities(self.docx_config["p_text_alignment"]))
 53 |     
 54 |         self.heading_bold = bool(np.random.binomial(1, self.docx_config["p_heading_bold"]))
 55 |         self.heading_relative_size = np.random.uniform(*self.docx_config["heading_relative_size_interval"])
 56 |         self.heading_size = Pt(self.heading_relative_size * self.font_size)
 57 |         self.heading_alignment = np.random.choice(
 58 |             (WD_PARAGRAPH_ALIGNMENT.CENTER, WD_PARAGRAPH_ALIGNMENT.LEFT, 
 59 |              WD_PARAGRAPH_ALIGNMENT.RIGHT, WD_PARAGRAPH_ALIGNMENT.JUSTIFY), 
 60 |             p=self._normalize_probabilities(self.docx_config["p_heading_alignment"]))
 61 |     
 62 |     def _normalize_probabilities(self, p):
 63 |         return np.array(p) / sum(p)
 64 | 
 65 |     def _list_available_fonts(self):
 66 |         font_paths = matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf')
 67 |         font_names = set()
 68 | 
 69 |         for font_path in font_paths:
 70 |             try:
 71 |                 font = matplotlib.font_manager.get_font(font_path)
 72 |                 font_names.add(font.family_name)
 73 |             except RuntimeError as e:
 74 |                 print(f"Could not load font from path: {font_path}, error: {e}")
 75 |         return list(font_names)
 76 |     
 77 |     def _init_colors(self, max_colors):
 78 |         colors = []
 79 |         hex_color = "#000000"
 80 |         x = int(max_colors ** (1/3)) + 1
 81 |         for i in range(x):
 82 |             for j in range(x):
 83 |                 for k in range(x):
 84 |                     # Convert HEX to RGB
 85 |                     r = int(hex_color[1:3], 16)
 86 |                     g = int(hex_color[3:5], 16)
 87 |                     b = int(hex_color[5:7], 16)
 88 | 
 89 |                     # Increment RGB values
 90 |                     r = (r + i) % 256
 91 |                     g = (g + j) % 256
 92 |                     b = (b + k) % 256
 93 |                     hex_color = '#{:02x}{:02x}{:02x}'.format(r, g, b)
 94 |                     colors.append(hex_color)
 95 |         return colors
 96 |     
 97 |     def configure_several_columns(self):
 98 |         section = self.doc.sections[0]
 99 |         sectPr = section._sectPr
100 |         cols = sectPr.xpath('./w:cols')[0]
101 |         cols.set(qn('w:num'), str(self.num_columns))    
102 | 
103 |     def add_paragraph(self):
104 |         paragraph = self.doc.add_paragraph()
105 |         paragraph.alignment = self.paragraph_alignment
106 |         paragraph.paragraph_format.space_after = 0
107 |         paragraph.paragraph_format.line_spacing_rule = self.line_spacing
108 |         
109 |         return paragraph
110 | 
111 |     def add_heading(self, element):
112 |         text = element.text
113 |         level = int(element.name[1])
114 |         
115 |         if len(self.doc.paragraphs) > 1 and "Heading" in self.doc.paragraphs[-2].style.style_id:
116 |             return
117 |         
118 |         if text not in ["Contents"]:
119 |             paragraph = self.doc.add_heading(level=level)
120 |             _, metadata = self.add_words(text, paragraph)
121 |             self.color2word.update(metadata)
122 |             #for run in paragraph.runs:
123 |                 #run.font.size = self.heading_size
124 |             paragraph.alignment = self.heading_alignment
125 |             
126 |             p = self.add_paragraph()
127 |             p.paragraph_format.line_spacing_rule = WD_LINE_SPACING.SINGLE
128 |     
129 |     def add_table(self, html_element):
130 |         rows = html_element.find_all('tr')
131 |         parsed_table = []
132 |         for row in rows:
133 |             cells = row.find_all(['th', 'td'])
134 |             parsed_row = []
135 |             for cell in cells:
136 |                 parsed_row.append(cell.text.strip())
137 |             parsed_table.append(parsed_row)
138 | 
139 |         rows = len(parsed_table)
140 |         cols = max(len(row) for row in parsed_table)
141 |         if rows <= self.docx_config["table_max_rows"] and cols <= self.docx_config["table_max_cols"]:
142 |             table = self.doc.add_table(rows=len(parsed_table), cols=max(len(row) for row in parsed_table))
143 |             table.style = 'TableGrid'
144 |             self.set_table_border_color(table, "FFFFFF")
145 |             # Populating table data
146 |             for i, row_data in enumerate(parsed_table):
147 |                 for j, cell_data in enumerate(row_data):
148 |                     _, metadata = self.add_words(cell_data, table.cell(i, j).paragraphs[0])
149 |                     self.color2word.update(metadata)
150 |                     '''table.cell(i, j).paragraphs[0].paragraph_format.line_spacing = Pt(24)
151 |                     for run in table.cell(i, j).paragraphs[0].runs:
152 |                         run.font.size = self.doc_config["font_size"]'''
153 |     def add_text(self, html_element):
154 |         paragraph = self.add_paragraph()
155 |         prev_word = " "
156 |         first_word = True
157 |         for i, child in enumerate(html_element.children):
158 |             if i > 0:
159 |                 first_word = False
160 |             prev_word, metadata = self.add_words(child.get_text(), paragraph, formatting=child.name, prev_word=prev_word, first_word=first_word)
161 |             self.color2word.update(metadata)
162 | 
163 |     def add_words(self, text, paragraph, formatting=None, prev_word=" ", first_word=False):
164 |         text = re.sub(r'\[.*?\]', '', text)
165 |         words = re.split(r'\s+', text)
166 |         metadata = {}
167 |         for word in words:
168 |             if word:
169 |                 if word[0] not in ",.?!:;)}]»" and prev_word[-1] not in "«[{(":
170 |                     if first_word:
171 |                         paragraph.add_run(' ' * 4)
172 |                     else:
173 |                         paragraph.add_run(' ')
174 | 
175 |                 run = paragraph.add_run(word)
176 |                 color = self.color_word(run)
177 |                 metadata[color] = word
178 |                 if formatting == 'b':
179 |                     run.bold = True
180 |                 if formatting == 'i':
181 |                     run.italic = True
182 |                 if formatting == 'u':
183 |                     run.underline = True
184 |                 prev_word = word
185 |                 run.font.size = self.font_size
186 |                 run.font.name = self.font_name
187 |         return prev_word, metadata
188 |     
189 |     def color_word(self, run):
190 |         color = self.colors[self.color_ptr]
191 | 
192 |         self.color_ptr += 1
193 |         tag = run._r
194 | 
195 |         # Create XML element
196 |         shd = OxmlElement('w:shd')
197 | 
198 |         # Add attributes to the element
199 |         shd.set(qn('w:val'), 'clear')
200 |         shd.set(qn('w:color'), 'auto')
201 |         shd.set(qn('w:fill'), color)
202 | 
203 |         # Set the font size - this is important! Without this step the
204 |         # tag.rPr value below will be None.
205 |         run.element.get_or_add_rPr()
206 | 
207 |         tag.rPr.append(shd)
208 | 
209 |         run.font.color.rgb = RGBColor(*tuple(int(color[i:i + 2], 16) for i in (1, 3, 5)))
210 |         return color
211 | 
212 |     def set_table_border_color(self, table, color):
213 |         tbl = table._element
214 |         tbl_pr = tbl.tblPr
215 | 
216 |         # Create a new border element
217 |         tbl_borders = OxmlElement('w:tblBorders')
218 | 
219 |         # Create a list of border attributes
220 |         borders = [
221 |             'top',
222 |             'left',
223 |             'bottom',
224 |             'right',
225 |             'insideH',
226 |             'insideV'
227 |         ]
228 | 
229 |         # Iterate through each border attribute
230 |         for border in borders:
231 |             border_element = OxmlElement(f'w:{border}')
232 |             border_element.set(qn('w:val'), 'single')
233 |             border_element.set(qn('w:sz'), '4')
234 |             border_element.set(qn('w:space'), '0')
235 |             border_element.set(qn('w:color'), color)
236 |             tbl_borders.append(border_element)
237 | 
238 |         tbl_pr.append(tbl_borders)
239 | 
240 |     def save_docx(self, path):
241 |         self.doc.save(path)
242 | 
243 |     #@profileit
244 |     def get_images(self, image_size, dpi) -> list[Image]:
245 |         out = io.BytesIO()
246 |         self.doc.save(out)
247 |         doc_bytes = out.getvalue()
248 |         pdf_bytes = self.uno_client.convert(indata=doc_bytes, convert_to='pdf')
249 |         return convert_from_bytes(pdf_bytes, dpi=dpi, size=image_size) 
250 |     
251 |     def convert_to_uncolored_docx(self):
252 |         for paragraph in self.doc.paragraphs:
253 |             for run in paragraph.runs:
254 |                 run.font.color.rgb = RGBColor(0, 0, 0)
255 |                 rpr = run.element.get_or_add_rPr()
256 |                 element = rpr.find(qn('w:shd'))
257 |                 if element is not None:
258 |                     element.set(qn('w:fill'), "#FFFFFF")
259 | 
260 |         for table in self.doc.tables:
261 |             self.set_table_border_color(table, "000000")
262 |             for row in table.rows:
263 |                 for cell in row.cells:
264 |                     for run in cell.paragraphs[0].runs:
265 |                         run.font.color.rgb = RGBColor(0, 0, 0)
266 |                         rpr = run.element.get_or_add_rPr()
267 |                         element = rpr.find(qn('w:shd'))
268 |                         if element is not None:
269 |                             element.set(qn('w:fill'), "#FFFFFF")
270 |         
271 |     def get_num_words(self):
272 |         return len(self.color2word)
273 | 


--------------------------------------------------------------------------------
/src/augmentations.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | from augraphy import * 
  3 | 
  4 | 
  5 | def get_augmentation_phases():
  6 | 
  7 |     pre_phase = [
  8 |         # Rescale(scale="optimal", target_dpi = 300,  p = 1.0),
  9 |     ]
 10 | 
 11 |     ink_phase = [
 12 |         InkColorSwap(
 13 |             ink_swap_color="random",
 14 |             ink_swap_sequence_number_range=(5, 10),
 15 |             ink_swap_min_width_range=(2, 3),
 16 |             ink_swap_max_width_range=(100, 120),
 17 |             ink_swap_min_height_range=(2, 3),
 18 |             ink_swap_max_height_range=(100, 120),
 19 |             ink_swap_min_area_range=(10, 20),
 20 |             ink_swap_max_area_range=(400, 500),
 21 |             p=0.1,
 22 |         ),
 23 |         LinesDegradation(
 24 |             line_roi=(0.0, 0.0, 1.0, 1.0),
 25 |             line_gradient_range=(32, 255),
 26 |             line_gradient_direction=(0, 2),
 27 |             line_split_probability=(0.2, 0.4),
 28 |             line_replacement_value=(250, 255),
 29 |             line_min_length=(30, 40),
 30 |             line_long_to_short_ratio=(5, 7),
 31 |             line_replacement_probability=(0.4, 0.5),
 32 |             line_replacement_thickness=(1, 3),
 33 |             p=0.1,
 34 |         ),
 35 |         OneOf(
 36 |             [
 37 |                 Dithering(
 38 |                     dither=random.choice(["ordered", "floyd-steinberg"]),
 39 |                     order=(3, 5),
 40 |                 ),
 41 |                 InkBleed(
 42 |                     intensity_range=(0.1, 0.2),
 43 |                     kernel_size=random.choice([(7, 7), (5, 5), (3, 3)]),
 44 |                     severity=(0.4, 0.6),
 45 |                 ),
 46 |             ],
 47 |             p=0.1,
 48 |         ),
 49 |         OneOf(
 50 |             [
 51 |                 InkShifter(
 52 |                    text_shift_scale_range=(18, 27),
 53 |                    text_shift_factor_range=(1, 4),
 54 |                    text_fade_range=(0, 2),
 55 |                    blur_kernel_size=(5, 5),
 56 |                    blur_sigma=0,
 57 |                    noise_type="random",
 58 |                 ),
 59 |                 BleedThrough(
 60 |                     intensity_range=(0.1, 0.3),
 61 |                     color_range=(32, 224),
 62 |                     ksize=(17, 17),
 63 |                     sigmaX=1,
 64 |                     alpha=random.uniform(0.1, 0.2),
 65 |                     offsets=(10, 20),
 66 |                 ),
 67 |             ],
 68 |             p=0.1,
 69 |         ),
 70 |         OneOf(
 71 |             [
 72 |                 Hollow(
 73 |                     hollow_median_kernel_value_range=(71, 101),
 74 |                     hollow_min_width_range=(1, 2),
 75 |                     hollow_max_width_range=(150, 200),
 76 |                     hollow_min_height_range=(1, 2),
 77 |                     hollow_max_height_range=(150, 200),
 78 |                     hollow_min_area_range=(10, 20),
 79 |                     hollow_max_area_range=(2000, 5000),
 80 |                     hollow_dilation_kernel_size_range=(1, 2),
 81 |                 ),
 82 |                 Letterpress(
 83 |                     n_samples=(100, 400),
 84 |                     n_clusters=(200, 400),
 85 |                     std_range=(500, 3000),
 86 |                     value_range=(150, 224),
 87 |                     value_threshold_range=(96, 128),
 88 |                     blur=1,
 89 |                 ),
 90 |             ],
 91 |             p=0.1,
 92 |         ),
 93 |         OneOf(
 94 |             [
 95 |                 LowInkRandomLines(
 96 |                     count_range=(5, 10),
 97 |                     use_consistent_lines=random.choice([True, False]),
 98 |                     noise_probability=0.1,
 99 |                 ),
100 |                 LowInkPeriodicLines(
101 |                     count_range=(2, 5),
102 |                     period_range=(16, 32),
103 |                     use_consistent_lines=random.choice([True, False]),
104 |                     noise_probability=0.1,
105 |                 ),
106 |             ],
107 |             p=0.1,
108 |         ),
109 |     ]
110 | 
111 |     paper_phase = [
112 |         PaperFactory(p=0.1),
113 |         ColorPaper(
114 |             hue_range=(0, 255),
115 |             saturation_range=(10, 40),
116 |             p=0.1,
117 |         ),
118 |         OneOf(
119 |             [
120 |                 #DelaunayTessellation(
121 |                 #    n_points_range=(500, 800),
122 |                 #    n_horizontal_points_range=(500, 800),
123 |                 #    n_vertical_points_range=(500, 800),
124 |                 #    noise_type="random",
125 |                 #    color_list="default",
126 |                 #    color_list_alternate="default",
127 |                 #),
128 |                 PatternGenerator(
129 |                     imgx=random.randint(256, 512),
130 |                     imgy=random.randint(256, 512),
131 |                     n_rotation_range=(10, 15),
132 |                     color="random",
133 |                     alpha_range=(0.25, 0.5),
134 |                 ),
135 |                 #VoronoiTessellation(
136 |                  #   mult_range=(50, 80),
137 |                   #  seed=19829813472,
138 |                   #  num_cells_range=(500, 1000),
139 |                   #  noise_type="random",
140 |                   #  background_value=(200, 255),
141 |                 #),
142 |             ],
143 |             p=0.1,
144 |         ),
145 |         WaterMark(
146 |             watermark_word="random",
147 |             watermark_font_size=(10, 15),
148 |             watermark_font_thickness=(20, 25),
149 |             watermark_rotation=(0, 360),
150 |             watermark_location="random",
151 |             watermark_color="random",
152 |             watermark_method="darken",
153 |             p=0.1,
154 |         ),
155 |         OneOf(
156 |             [
157 |                 AugmentationSequence(
158 |                     [
159 |                         NoiseTexturize(
160 |                             sigma_range=(3, 10),
161 |                             turbulence_range=(2, 5),
162 |                             texture_width_range=(300, 500),
163 |                             texture_height_range=(300, 500),
164 |                         ),
165 |                         BrightnessTexturize(
166 |                             texturize_range=(0.9, 0.99),
167 |                             deviation=0.03,
168 |                         ),
169 |                     ],
170 |                 ),
171 |                 AugmentationSequence(
172 |                     [
173 |                         BrightnessTexturize(
174 |                             texturize_range=(0.9, 0.99),
175 |                             deviation=0.03,
176 |                         ),
177 |                         NoiseTexturize(
178 |                             sigma_range=(3, 10),
179 |                             turbulence_range=(2, 5),
180 |                             texture_width_range=(300, 500),
181 |                             texture_height_range=(300, 500),
182 |                         ),
183 |                     ],
184 |                 ),
185 |             ],
186 |             p=0.1,
187 |         ),
188 |     ]
189 | 
190 |     post_phase = [
191 |             ColorShift(
192 |                 color_shift_offset_x_range=(3, 5),
193 |                 color_shift_offset_y_range=(3, 5),
194 |                 color_shift_iterations=(2, 3),
195 |                 color_shift_brightness_range=(0.9, 1.1),
196 |                 color_shift_gaussian_kernel_range=(3, 3),
197 |                 p=0.1,
198 |             ),
199 |         OneOf(
200 |             [
201 |                 DirtyDrum(
202 |                     line_width_range=(1, 6),
203 |                     line_concentration=random.uniform(0.05, 0.15),
204 |                     direction=random.randint(0, 2),
205 |                     noise_intensity=random.uniform(0.6, 0.95),
206 |                     noise_value=(64, 224),
207 |                     ksize=random.choice([(3, 3), (5, 5), (7, 7)]),
208 |                     sigmaX=0,
209 |                     p=0.1,
210 |                 ),
211 |                 DirtyRollers(
212 |                     line_width_range=(2, 32),
213 |                     scanline_type=0,
214 |                 ),
215 |             ],
216 |             p=0.1,
217 |         ),
218 |         OneOf(
219 |             [
220 |                 LightingGradient(
221 |                     light_position=None,
222 |                     direction=None,
223 |                     max_brightness=255,
224 |                     min_brightness=0,
225 |                     mode="gaussian",
226 |                     linear_decay_rate=None,
227 |                     transparency=None,
228 |                 ),
229 |                 Brightness(
230 |                     brightness_range=(0.9, 1.1),
231 |                     min_brightness=0,
232 |                     min_brightness_value=(120, 150),
233 |                 ),
234 |                 Gamma(
235 |                     gamma_range=(0.9, 1.1),
236 |                 ),
237 |             ],
238 |             p=0.1,
239 |         ),
240 |         OneOf(
241 |             [
242 |                 SubtleNoise(
243 |                     subtle_range=random.randint(5, 10),
244 |                 ),
245 |                 Jpeg(
246 |                     quality_range=(25, 95),
247 |                 ),
248 |             ],
249 |             p=0.1,
250 |         ),
251 |         OneOf(
252 |             [
253 |                 Markup(
254 |                     num_lines_range=(2, 7),
255 |                     markup_length_range=(0.5, 1),
256 |                     markup_thickness_range=(1, 2),
257 |                     markup_type=random.choice(["strikethrough", "crossed", "highlight", "underline"]),
258 |                     markup_color="random",
259 |                     single_word_mode=False,
260 |                     repetitions=1,
261 |                 ),
262 |                 Scribbles(
263 |                     scribbles_type="random",
264 |                     scribbles_location="random",
265 |                     scribbles_size_range=(250, 600),
266 |                     scribbles_count_range=(1, 6),
267 |                     scribbles_thickness_range=(1, 3),
268 |                     scribbles_brightness_change=[32, 64, 128],
269 |                     scribbles_text="random",
270 |                     scribbles_text_font="random",
271 |                     scribbles_text_rotate_range=(0, 360),
272 |                     scribbles_lines_stroke_count_range=(1, 6),
273 |                 ),
274 |             ],
275 |             p=0.1,
276 |         ),
277 |         OneOf(
278 |             [
279 |                 #BadPhotoCopy(
280 |                 #    noise_mask=None,
281 |                 #    noise_type=-1,
282 |                 #    noise_side="random",
283 |                 #    noise_iteration=(1, 2),
284 |                 #    noise_size=(1, 3),
285 |                 #    noise_value=(128, 196),
286 |                 #    noise_sparsity=(0.3, 0.6),
287 |                 #    noise_concentration=(0.1, 0.6),
288 |                 #    blur_noise=random.choice([True, False]),
289 |                 #    blur_noise_kernel=random.choice([(3, 3), (5, 5), (7, 7)]),
290 |                 #    wave_pattern=random.choice([True, False]),
291 |                 #    edge_effect=random.choice([True, False]),
292 |                 #),
293 |                 ShadowCast(
294 |                     shadow_side="random",
295 |                     shadow_vertices_range=(1, 20),
296 |                     shadow_width_range=(0.3, 0.8),
297 |                     shadow_height_range=(0.3, 0.8),
298 |                     shadow_color=(0, 0, 0),
299 |                     shadow_opacity_range=(0.2, 0.9),
300 |                     shadow_iterations_range=(1, 2),
301 |                     shadow_blur_kernel_range=(101, 301),
302 |                 ),
303 |                 LowLightNoise(
304 |                     num_photons_range=(50, 100),
305 |                     alpha_range=(0.7, 1.0),
306 |                     beta_range=(10, 30),
307 |                     gamma_range=(1, 1.8),
308 |                     bias_range=(20, 40),
309 |                     dark_current_value=1.0,
310 |                     exposure_time=0.2,
311 |                     gain=0.1,
312 |                 ),
313 |             ],
314 |             p=0.1,
315 |         ),
316 |         OneOf(
317 |             [
318 |                 NoisyLines(
319 |                     noisy_lines_direction="random",
320 |                     noisy_lines_location="random",
321 |                     noisy_lines_number_range=(5, 20),
322 |                     noisy_lines_color=(0, 0, 0),
323 |                     noisy_lines_thickness_range=(1, 2),
324 |                     noisy_lines_random_noise_intensity_range=(0.01, 0.1),
325 |                     noisy_lines_length_interval_range=(0, 100),
326 |                     noisy_lines_gaussian_kernel_value_range=(3, 5),
327 |                     noisy_lines_overlay_method="ink_to_paper",
328 |                 ),
329 |                 BindingsAndFasteners(
330 |                     overlay_types="darken",
331 |                     foreground=None,
332 |                     effect_type="random",
333 |                     width_range="random",
334 |                     height_range="random",
335 |                     angle_range=(-30, 30),
336 |                     ntimes=(2, 6),
337 |                     nscales=(0.9, 1.0),
338 |                     edge="random",
339 |                     edge_offset=(10, 50),
340 |                     use_figshare_library=0,
341 |                 ),
342 |             ],
343 |             p=0.1,
344 |         ),
345 |         OneOf(
346 |             [
347 |                 Squish(
348 |                     squish_direction="random",
349 |                     squish_location="random",
350 |                     squish_number_range=(5, 10),
351 |                     squish_distance_range=(5, 7),
352 |                     squish_line="random",
353 |                     squish_line_thickness_range=(1, 1),
354 |                 ),
355 |                 Geometric(
356 |                     fliplr=False,
357 |                     flipud=False,
358 |                     crop=(),
359 |                     rotate_range=(-4, 4),
360 |                     randomize=0,
361 |                     p=1,
362 |                 ),
363 |             ],
364 |             p=0.1,
365 |         ),
366 |         OneOf(
367 |             [
368 |                 DotMatrix(
369 |                     dot_matrix_shape="random",
370 |                     dot_matrix_dot_width_range=(3, 3),
371 |                     dot_matrix_dot_height_range=(3, 3),
372 |                     dot_matrix_min_width_range=(1, 2),
373 |                     dot_matrix_max_width_range=(150, 200),
374 |                     dot_matrix_min_height_range=(1, 2),
375 |                     dot_matrix_max_height_range=(150, 200),
376 |                     dot_matrix_min_area_range=(10, 20),
377 |                     dot_matrix_max_area_range=(2000, 5000),
378 |                     dot_matrix_median_kernel_value_range=(128, 255),
379 |                     dot_matrix_gaussian_kernel_value_range=(1, 3),
380 |                     dot_matrix_rotate_value_range=(0, 360),
381 |                 ),
382 |                 Faxify(
383 |                     scale_range=(0.3, 0.6),
384 |                     monochrome=random.choice([0, 1]),
385 |                     monochrome_method="random",
386 |                     monochrome_arguments={},
387 |                     halftone=random.choice([0, 1]),
388 |                     invert=1,
389 |                     half_kernel_size=random.choice([(1, 1), (2, 2)]),
390 |                     angle=(0, 360),
391 |                     sigma=(1, 3),
392 |                 ),
393 |             ],
394 |             p=0.1,
395 |         ),
396 |         OneOf(
397 |             [
398 |                 InkMottling(
399 |                     ink_mottling_alpha_range=(0.2, 0.3),
400 |                     ink_mottling_noise_scale_range=(2, 2),
401 |                     ink_mottling_gaussian_kernel_range=(3, 5),
402 |                 ),
403 |                 ReflectedLight(
404 |                     reflected_light_smoothness=0.8,
405 |                     reflected_light_internal_radius_range=(0.0, 0.001),
406 |                     reflected_light_external_radius_range=(0.5, 0.8),
407 |                     reflected_light_minor_major_ratio_range=(0.9, 1.0),
408 |                     reflected_light_color=(255, 255, 255),
409 |                     reflected_light_internal_max_brightness_range=(0.75, 0.75),
410 |                     reflected_light_external_max_brightness_range=(0.5, 0.75),
411 |                     reflected_light_location="random",
412 |                     reflected_light_ellipse_angle_range=(0, 360),
413 |                     reflected_light_gaussian_kernel_size_range=(5, 310),
414 |                     p=0.1,
415 |                 ),
416 |             ],
417 |             p=0.1,
418 |         ),
419 |         OneOf(
420 |             [
421 |                 PageBorder(
422 |                     page_border_width_height="random",
423 |                     page_border_color=(0, 0, 0),
424 |                     page_border_background_color=(0, 0, 0),
425 |                     page_numbers="random",
426 |                     page_rotation_angle_range=(-3, 3),
427 |                     curve_frequency=(2, 8),
428 |                     curve_height=(2, 4),
429 |                     curve_length_one_side=(50, 100),
430 |                     same_page_border=random.choice([0, 1]),
431 |                 ),
432 |                 #BookBinding(
433 |                 #    shadow_radius_range=(30, 100),
434 |                 #    curve_range_right=(50, 200),
435 |                 #    curve_range_left=(50, 200),
436 |                 #    curve_ratio_right=(0.1, 0.3),
437 |                 #    curve_ratio_left=(0.1, 0.3),
438 |                 #    mirror_range=(1.0, 1.0),
439 |                 #    binding_align="random",
440 |                 #    binding_pages=(5, 10),
441 |                 #    curling_direction=-1,
442 |                 #    backdrop_color=(0, 0, 0),
443 |                 #    enable_shadow=random.choice([0, 1]),
444 |                 #),
445 |                 Folding(
446 |                     fold_x=None,
447 |                     fold_deviation=(0, 0),
448 |                     fold_count=random.randint(2, 8),
449 |                     fold_noise=0.01,
450 |                     fold_angle_range=(-360, 360),
451 |                     gradient_width=(0.1, 0.2),
452 |                     gradient_height=(0.01, 0.02),
453 |                     backdrop_color=(0, 0, 0),
454 |                 ),
455 |             ],
456 |             p=0.1,
457 |         ),
458 |         # Rescale(scale = "original" , p = 1.0)
459 |     ]
460 |     return {'ink_phase': ink_phase, 
461 |             'paper_phase': paper_phase, 
462 |             'post_phase': post_phase, 
463 |             'pre_phase': pre_phase}
464 | 


--------------------------------------------------------------------------------