├── .gitignore ├── resources ├── im_2.png ├── im_9.png ├── im_10.png ├── im_12.png └── DoGe_Scheme.png ├── scripts ├── print_profiling_stats.py ├── count_files.bash └── show_image_annotations.py ├── requirements.txt ├── docx_config.json ├── Dockerfile ├── src ├── url_parser.py ├── utils.py ├── manager.py ├── document_generator.py ├── docx_document.py └── augmentations.py ├── main.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | *cache* 3 | nohup.out 4 | *.png 5 | .vscode 6 | *.profile 7 | logs 8 | fonts -------------------------------------------------------------------------------- /resources/im_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Travvy88/DocumentGenerator_DoGe/HEAD/resources/im_2.png -------------------------------------------------------------------------------- /resources/im_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Travvy88/DocumentGenerator_DoGe/HEAD/resources/im_9.png -------------------------------------------------------------------------------- /resources/im_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Travvy88/DocumentGenerator_DoGe/HEAD/resources/im_10.png -------------------------------------------------------------------------------- /resources/im_12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Travvy88/DocumentGenerator_DoGe/HEAD/resources/im_12.png -------------------------------------------------------------------------------- /resources/DoGe_Scheme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Travvy88/DocumentGenerator_DoGe/HEAD/resources/DoGe_Scheme.png 
-------------------------------------------------------------------------------- /scripts/print_profiling_stats.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pstats 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('path') 6 | parser.add_argument('-n') 7 | args = parser.parse_args() 8 | 9 | p = pstats.Stats(args.path) 10 | p.strip_dirs().sort_stats(pstats.SortKey.CUMULATIVE).print_stats(int(args.n)) -------------------------------------------------------------------------------- /scripts/count_files.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | count_files() { 4 | if [ -z "$1" ]; then 5 | echo "No argument provided" >&2 6 | return 1 7 | fi 8 | if [ ! -d "$1" ]; then 9 | echo "Argument is not a directory" >&2 10 | return 1 11 | fi 12 | find "$1" -type f | wc -l 13 | } 14 | 15 | count_files "$@" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -e git+https://github.com/Travvy88/augraphy.git@fix_bboxes_oneof_augmentation_sequence#egg=augraphy 2 | opencv-python==4.10.0.84 3 | python-docx==1.1.2 4 | matplotlib==3.8.2 5 | pdf2image==1.17.0 6 | tqdm==4.66.5 7 | unoserver==2.1 8 | unotools==0.3.3 9 | beautifulsoup4==4.12.3 10 | requests==2.32.3 11 | pillow-simd==9.5.0.post2 -------------------------------------------------------------------------------- /docx_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "max_words": 20000, 3 | "p_2columns": 0.2, 4 | "font_size_interval": [8, 13], 5 | "p_line_spacing": [0.5, 0.5], 6 | "p_text_alignment": [0.1, 0.4, 0, 0.5], 7 | "p_heading_bold": 0.5, 8 | "heading_relative_size_interval": [1, 2], 9 | "p_heading_alignment": [0.5, 0.25, 0.01, 0.24], 10 | "table_max_rows": 15, 11 | "table_max_cols": 5 12 
| } -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | # Install python and libreoffice 4 | RUN apt-get update && apt-get install -y libreoffice python3 python3-pip git libjpeg-dev zlib1g-dev poppler-utils 5 | 6 | # install unoserver to python that is used by LibreOffice 7 | RUN /usr/bin/python3 -m pip install --user unoserver 8 | 9 | # Set working directory to /app 10 | WORKDIR /app 11 | 12 | # Copy requirements file 13 | COPY requirements.txt . 14 | 15 | # Install Python dependencies 16 | RUN pip3 install -r requirements.txt 17 | 18 | # Copy the current directory contents into the container at /app 19 | COPY src /app/src 20 | COPY main.py /app/main.py 21 | COPY docx_config.json /app/docx_config.json 22 | 23 | # run interactively 24 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /scripts/show_image_annotations.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from PIL import Image, ImageDraw, ImageFont 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('path') 7 | 8 | args = parser.parse_args() 9 | 10 | image = Image.open(args.path) 11 | height, width = image.size 12 | 13 | with open(args.path + '.json', 'r') as f: 14 | annotations = json.load(f) 15 | 16 | draw = ImageDraw.Draw(image) 17 | font = ImageFont.truetype('arial.ttf', size=8) 18 | for word, bbox in zip(annotations['words'], annotations['bboxes']): 19 | x, y, w, h = bbox 20 | 21 | 22 | 23 | 24 | if w < 0: 25 | x = x - w 26 | w = w * -1 27 | if h < 0: 28 | y = y - h 29 | h = h * -1 30 | 31 | x1 = int(x * width) 32 | y1 = int(y * height) 33 | x2 = int((x + w) * width) 34 | y2 = int((y + h) * height) 35 | 36 | print([x, y, w, h]) 37 | print([x1, y1, x2, y2]) 38 | 39 | draw.rectangle([x1, y1, x2, y2], 
outline="blue", width=1) 40 | draw.text((x1, y1 - 10), word, fill="red", font=font, ) # Adjust position as needed 41 | 42 | output_image_path = 'show_anno.png' 43 | image.save(output_image_path) 44 | -------------------------------------------------------------------------------- /src/url_parser.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urljoin, urlparse 2 | from bs4 import BeautifulSoup 3 | import requests 4 | from tqdm import tqdm 5 | 6 | 7 | class UrlParser: 8 | def parse(self, start_url, max_urls, languages): 9 | ptr = 0 10 | urls = [] 11 | urls.append(start_url) 12 | 13 | pbar = tqdm(initial=1, total=max_urls) 14 | while len(urls) < max_urls: 15 | url = urls[ptr] 16 | try: 17 | response = requests.get(url) 18 | response.raise_for_status() 19 | except requests.exceptions.RequestException as e: 20 | print(f"Failed to retrieve {url}: {e}") 21 | return 22 | 23 | # Parse the page content 24 | soup = BeautifulSoup(response.content, 'html.parser') 25 | 26 | # Find all links on the page 27 | links = soup.find_all('a', href=True) 28 | for link in links: 29 | href = link['href'] 30 | full_url = urljoin(url, href) 31 | if self.is_valid_url(full_url, languages) and full_url not in urls and len(urls) < max_urls: 32 | urls.append(full_url) 33 | pbar.update(1) 34 | ptr += 1 35 | return urls[1:] 36 | 37 | def is_valid_url(self, url, languages): 38 | # Check if the URL is a valid Wikipedia article URL 39 | parsed = urlparse(url) 40 | if parsed.scheme in ('http', 'https') and 'wikipedia.org' in parsed.netloc and \ 41 | any(parsed.netloc.find(lang_element) != -1 for lang_element in languages): 42 | path = parsed.path 43 | if path.startswith('/wiki/') and not any(sub in path for sub in [':', '/wiki/Main_Page']): 44 | return True 45 | return False -------------------------------------------------------------------------------- /main.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import cProfile 3 | import json 4 | from pathlib import Path 5 | from src.manager import Manager 6 | 7 | 8 | def create_parser(): 9 | parser = argparse.ArgumentParser(description="Manager Configuration") 10 | 11 | parser.add_argument('--out_dir', type=str, required=True, 12 | help='Output directory for saving results') 13 | parser.add_argument('--remove_existing_dir', action='store_true', 14 | help='If out_dir exists, delete the folder and files before creating a new one') 15 | parser.add_argument('--debug', action='store_true', 16 | help='Enable debug mode') 17 | parser.add_argument('--image_size', type=int, default=244, 18 | help='Size of the final images (default: 244)') 19 | parser.add_argument('--start_page', type=str, default='https://en.wikipedia.org/wiki/Main_Page', 20 | help='Starting page URL (default: Wikipedia main page)') 21 | parser.add_argument('--languages', type=str, nargs='+', default=['en'], 22 | help='Permitted languages. Other languages will be ignored (default: English)') 23 | parser.add_argument('--max_urls', type=int, default=16, 24 | help='Maximum number of URLs to process (default: 100)') 25 | parser.add_argument('--num_processes', type=int, default=1, 26 | help='Number of processes to use (default: 1)') 27 | parser.add_argument('--max_threads', type=int, default=3, 28 | help='Maximum threads inside a process (default: 3)') 29 | parser.add_argument('--ports', type=str, nargs='+', default=[8145, 8146], 30 | help='List of ports to use (default: [8145, 8146]). 
Number of ports \ 31 | should be 2 times larger than num_processes') 32 | 33 | return parser 34 | 35 | if __name__ == "__main__": 36 | parser = create_parser() 37 | args = parser.parse_args() 38 | 39 | with open('docx_config.json', 'r') as f: 40 | docx_config = json.load(f) 41 | 42 | manager = Manager( 43 | docx_config=docx_config, 44 | out_dir=Path(args.out_dir), 45 | remove_existing_dir=args.remove_existing_dir, 46 | debug=args.debug, 47 | image_size=args.image_size, 48 | start_page=args.start_page, 49 | languages=tuple(args.languages), 50 | max_urls=args.max_urls, 51 | num_processes=args.num_processes, 52 | max_threads=args.max_threads, 53 | ports=tuple(args.ports) 54 | ) 55 | manager.generate() 56 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | from PIL import Image, ImageDraw 2 | import cv2 3 | import numpy as np 4 | 5 | 6 | def convert_xywh_to_x1y1x2y2(bboxes): 7 | if isinstance(bboxes, list): 8 | return [[bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]] for bbox in bboxes] 9 | if isinstance(bboxes, np.ndarray): 10 | x1 = bboxes[:, 0] 11 | x2 = bboxes[:, 1] 12 | x3 = bboxes[:, 0] + bboxes[:, 2] 13 | x4 = bboxes[:, 1] + bboxes[:, 3] 14 | return np.column_stack((x1, x2, x3, x4)) 15 | 16 | 17 | def convert_x1y1x2y2_to_xywh(bboxes): 18 | if isinstance(bboxes, list): 19 | return [[bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]] for bbox in bboxes] 20 | if isinstance(bboxes, np.ndarray): 21 | x = bboxes[:, 0] 22 | y = bboxes[:, 1] 23 | w = bboxes[:, 2] - bboxes[:, 0] 24 | h = bboxes[:, 3] - bboxes[:, 1] 25 | return np.column_stack((x, y, w, h)) 26 | 27 | 28 | def normalize_bboxes(bboxes, width, height): 29 | if isinstance(bboxes, list): 30 | return [[bbox[0] / width, bbox[1] / height, bbox[2] / width, bbox[3] / height] for bbox in bboxes] 31 | if isinstance(bboxes, np.ndarray): 32 | el1 = bboxes[:, 0] / width 33 | el2 
= bboxes[:, 1] / height 34 | el3 = bboxes[:, 2] / width 35 | el4 = bboxes[:, 3] / height 36 | return np.column_stack((el1, el2, el3, el4)) 37 | 38 | 39 | def unnormalize_bboxes(bboxes, width, height): 40 | if isinstance(bboxes, list): 41 | return [[bbox[0] * width, bbox[1] * height, bbox[2] * width, bbox[3] * height] for bbox in bboxes] 42 | if isinstance(bboxes, np.ndarray): 43 | el1 = bboxes[:, 0] * width 44 | el2 = bboxes[:, 1] * height 45 | el3 = bboxes[:, 2] * width 46 | el4 = bboxes[:, 3] * height 47 | return np.column_stack((el1, el2, el3, el4)) 48 | 49 | def draw_bboxes_pil(image, bboxes, words=None): 50 | draw = ImageDraw.Draw(image) 51 | for bbox, word in zip(bboxes, words): 52 | x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]) 53 | # Draw rectangle with red color and 2px thickness 54 | draw.rectangle([(x1, y1), (x2, y2)], outline="red", width=2) 55 | # Optionally add text labels above the boxes 56 | draw.text((x1, y1-15), word, fill="red") 57 | return image 58 | 59 | def draw_bboxes(image, bboxes, words=None): 60 | # bboxes in x1, y1, x2, y2 format 61 | if words is None: 62 | words = [""] * len(bboxes) 63 | 64 | if isinstance(image, np.ndarray): 65 | image = Image.fromarray(image) 66 | image = draw_bboxes_pil(image, bboxes, words) 67 | image = np.array(image) 68 | elif isinstance(image, Image.Image): 69 | image = draw_bboxes_pil(image, bboxes, words) 70 | else: 71 | raise ValueError(f"Unsupported image type: {type(image)}") 72 | 73 | return image 74 | -------------------------------------------------------------------------------- /src/manager.py: -------------------------------------------------------------------------------- 1 | import json 2 | import multiprocessing 3 | import os 4 | from pathlib import Path 5 | import shutil 6 | import time 7 | 8 | from tqdm import tqdm 9 | 10 | from src.document_generator import DocumentGenerator 11 | from src.url_parser import UrlParser 12 | 13 | 14 | class Manager: 15 | def __init__(self, 16 
| docx_config: dict, 17 | out_dir: Path, 18 | remove_existing_dir, 19 | debug, 20 | image_size, 21 | start_page, 22 | languages, 23 | max_urls, 24 | num_processes, 25 | max_threads, 26 | ports): 27 | 28 | self.docx_config = docx_config 29 | self.out_dir = out_dir 30 | self.debug = debug 31 | self.image_size = image_size 32 | self.start_page = start_page 33 | self.languages = languages 34 | self.max_urls = max_urls 35 | 36 | self.num_processes = num_processes 37 | self.max_threads = max_threads 38 | self.ports = ports 39 | 40 | self.url_parser = UrlParser() 41 | self.folders = self._create_folders(remove_existing_dir=remove_existing_dir) 42 | self.doc_generators = [DocumentGenerator(self.max_threads, 43 | self.image_size, 44 | self.docx_config, 45 | self.folders[i], 46 | ports[i], 47 | ports[num_processes + i], 48 | self.debug,) \ 49 | for i in range(num_processes)] 50 | 51 | def generate(self): 52 | start_time = time.time() 53 | print('Parsing urls...') 54 | urls = self.url_parser.parse(self.start_page, self.max_urls, self.languages) 55 | urls_chunks = self._split_urls_to_chunks(urls) 56 | processes = [] 57 | 58 | for i in range(self.num_processes): 59 | process = multiprocessing.Process(name=f"Generator_{i}", target=self.doc_generators[i].generate, 60 | kwargs={"urls": urls_chunks[i]}) 61 | processes.append(process) 62 | process.start() 63 | 64 | for process in processes: 65 | process.join() 66 | 67 | self._merge_all_folders() 68 | 69 | end_time = time.time() 70 | file_count = 0 71 | for root, dirs, files in os.walk(self.out_dir): 72 | file_count += len(files) 73 | file_count /= 2 74 | print('Images:', int(file_count)) 75 | print('Elapsed time:', end_time - start_time) 76 | print('Urls per second:', self.max_urls / (end_time - start_time)) 77 | print('Images per second:', file_count / (end_time - start_time)) 78 | print() 79 | print('Seconds per url:', (end_time - start_time) / self.max_urls) 80 | print('Seconds per image:', (end_time - start_time) / file_count) 
81 | print('Images per url:', file_count / self.max_urls) 82 | 83 | def _split_urls_to_chunks(self, urls): 84 | n = len(urls) 85 | chunk_size = n // self.num_processes 86 | remainder = n % self.num_processes 87 | 88 | chunks = [] 89 | for i in range(self.num_processes): 90 | start_index = i * chunk_size + min(i, remainder) 91 | end_index = start_index + chunk_size + (1 if i < remainder else 0) 92 | chunks.append(urls[start_index:end_index]) 93 | return chunks 94 | 95 | def _create_folders(self, remove_existing_dir): 96 | folders = [self.out_dir / f"tmp_process_{i}" for i in range(self.num_processes)] 97 | if remove_existing_dir: 98 | if os.path.exists(self.out_dir): 99 | shutil.rmtree(self.out_dir) 100 | for folder in folders: 101 | if os.path.exists(folder): 102 | shutil.rmtree(folder) 103 | 104 | for folder in folders: 105 | os.makedirs(folder) 106 | 107 | return folders 108 | 109 | def _validate_annotations(self, image_path, anno_path): 110 | if not os.path.exists(image_path): 111 | print(f"Image {image_path} not found") 112 | return False 113 | if not os.path.exists(anno_path): 114 | print(f"Annotation {anno_path} not found") 115 | return False 116 | 117 | with open(anno_path, 'r') as f: 118 | anno = json.load(f) 119 | 120 | if len(anno['words']) != len(anno['bboxes']): 121 | print(f"Annotation {anno_path} has different number of words and bboxes") 122 | return False 123 | 124 | return True 125 | 126 | def _merge_all_folders(self): 127 | counter = 0 128 | bad_annotations = 0 129 | for folder_path in tqdm(self.folders): 130 | if os.path.isdir(folder_path): 131 | # Iterate over each file in the current folder 132 | for file_name in sorted([f for f in os.listdir(folder_path) if f.endswith('.png.json')]): 133 | json_path = os.path.join(folder_path, file_name) 134 | 135 | if self._validate_annotations(json_path[:-5], json_path): 136 | new_file_name = f"image_{counter}.png" 137 | new_json_name = f"image_{counter}.png.json" 138 | 139 | new_file_path = 
os.path.join(self.out_dir, new_file_name) 140 | new_json_path = os.path.join(self.out_dir, new_json_name) 141 | 142 | # Move and rename the file 143 | shutil.move(json_path[:-5], new_file_path) 144 | shutil.move(json_path, new_json_path) 145 | #print(f'{json_path} -> {new_json_path}') 146 | #print(f'{json_path[:-5]} -> {new_file_path}') 147 | 148 | # Move the colored image 149 | _, number = file_name.split("_") 150 | number = number.split(".")[0] 151 | #print(f'{folder_path}/im_{number}_colored.png -> {new_file_path[:-4] + "_colored.png"}') 152 | #print('--------------------------------') 153 | if os.path.exists(f"{folder_path}/im_{number}_colored.png"): 154 | shutil.move(f"{folder_path}/im_{number}_colored.png", new_file_path[:-4] + "_colored.png") 155 | 156 | counter += 1 157 | else: 158 | bad_annotations += 1 159 | 160 | for i in range(self.num_processes): 161 | shutil.rmtree(self.out_dir / f'tmp_process_{i}') 162 | 163 | print(f'Folder merge finished, bad annotations: {bad_annotations}') 164 | -------------------------------------------------------------------------------- /src/document_generator.py: -------------------------------------------------------------------------------- 1 | import cProfile 2 | from concurrent.futures import ThreadPoolExecutor 3 | import json 4 | import multiprocessing 5 | import os 6 | from pathlib import Path 7 | import subprocess 8 | from time import sleep 9 | import time 10 | import traceback 11 | from bs4 import BeautifulSoup 12 | import numpy as np 13 | from augraphy import AugraphyPipeline 14 | from unoserver import client 15 | import requests 16 | from tqdm import tqdm 17 | from PIL import Image, ImageDraw 18 | import cv2 19 | import threading 20 | 21 | import src.utils as utils 22 | from src.augmentations import get_augmentation_phases 23 | from src.docx_document import DocxDocument 24 | 25 | 26 | def profileit(func): 27 | def wrapper(*args, **kwargs): 28 | datafn = func.__name__ + ".profile" # Name the data file sensibly 29 | 
prof = cProfile.Profile() 30 | retval = prof.runcall(func, *args, **kwargs) 31 | prof.dump_stats(datafn) 32 | return retval 33 | 34 | return wrapper 35 | 36 | class DocumentGenerator: 37 | def __init__(self, max_threads, image_size, docx_config, out_folder, port, uno_port, debug_mode): 38 | self.max_threads = max_threads 39 | self.image_size = image_size 40 | self.out_folder = out_folder 41 | self.docx_config = docx_config 42 | self.port = port 43 | self.uno_port = uno_port 44 | self.debug_mode = debug_mode 45 | 46 | self.image_counter = 0 47 | 48 | command = f"/usr/bin/python3 -m unoserver.server --port {port} --uno-port {uno_port} > /dev/null 2>&1" 49 | print('START SERVER', port, uno_port) 50 | self.unoserver_process = subprocess.Popen(command, shell=True) 51 | self.uno_client = client.UnoClient(port=port) 52 | 53 | def __del__(self): 54 | self.unoserver_process.kill() 55 | 56 | def generate(self, urls): 57 | print('Start Document Generator...') 58 | with ThreadPoolExecutor(max_workers=self.max_threads, 59 | thread_name_prefix=f"{multiprocessing.current_process().name}_thread") as executor: 60 | futures = [executor.submit(self.create_doc_try_except, url) for url in urls] 61 | for future in futures: 62 | future.result() 63 | 64 | def create_doc_try_except(self, url): 65 | try: 66 | self.create_doc(url) 67 | print(f'{threading.current_thread().name} total images generated by the current process: {self.image_counter}') 68 | except Exception as e: 69 | if self.debug_mode: 70 | print(traceback.format_exc()) 71 | else: 72 | print("skipping due augmentation error") 73 | 74 | #@profileit 75 | def create_doc(self, url): 76 | doc = DocxDocument(self.docx_config, self.uno_client) 77 | response = requests.get(url) 78 | if response.status_code != 200: 79 | print(f"Bad Response: {response}") 80 | return 81 | 82 | # create colored docx document 83 | soup = BeautifulSoup(response.text, 'html.parser') 84 | for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 
"table"]): 85 | if element.name.startswith('h'): 86 | doc.add_heading(element) 87 | elif element.name == "table": 88 | doc.add_table(element) 89 | else: 90 | doc.add_text(element) 91 | 92 | if doc.get_num_words() > self.docx_config["max_words"]: 93 | break 94 | 95 | # extract annotations from colored images 96 | colored_images = doc.get_images(dpi=200, image_size=1500) 97 | annotations = self.get_bboxes(colored_images, doc.color2word) # bboxes are normalized to [0,1] 98 | doc.convert_to_uncolored_docx() 99 | images = doc.get_images(dpi=200, image_size=1024) # get images for augmentation stage 100 | for i, image in enumerate(images): 101 | if len(annotations[i]['words']) != len(annotations[i]['bboxes']): 102 | continue 103 | # unnormalize bboxes to augmentation image size 104 | bounding_boxes = np.array(annotations[i]["bboxes"]) 105 | bounding_boxes = utils.unnormalize_bboxes(bounding_boxes, colored_images[0].size[0], colored_images[0].size[1]) 106 | 107 | # perform augmentation 108 | augmentation_pipeline = AugraphyPipeline(bounding_boxes=bounding_boxes, 109 | log=False, **get_augmentation_phases()) 110 | 111 | augmented_cv2, _, _, augmented_bounding_boxes = augmentation_pipeline(np.array(image)) 112 | augmented_image = Image.fromarray(augmented_cv2) 113 | with threading.Lock(): 114 | if self.debug_mode: 115 | bboxes_for_image = utils.normalize_bboxes(augmented_bounding_boxes, colored_images[0].size[0], colored_images[0].size[1]) 116 | bboxes_for_image = utils.unnormalize_bboxes(bboxes_for_image, augmented_image.size[0], augmented_image.size[1]) 117 | 118 | augmented_image = utils.draw_bboxes_pil(augmented_image, bboxes_for_image, annotations[i]["words"]) 119 | colored_image = utils.draw_bboxes_pil(colored_images[i], bounding_boxes, annotations[i]["words"]) 120 | colored_image.save(self.out_folder / f"im_{self.image_counter}_colored.png") 121 | 122 | # resize image to final dataset size and save 123 | augmented_image = augmented_image.resize((self.image_size, 
self.image_size)) 124 | augmented_image.save(self.out_folder / f"im_{self.image_counter}.png") 125 | 126 | # convert booxes to (x, y, w, h) format and normalize to [0,1] 127 | augmented_bounding_boxes = np.array(augmented_bounding_boxes).astype(int) 128 | annotations[i]["bboxes"] = utils.normalize_bboxes(augmented_bounding_boxes, colored_images[0].size[0], colored_images[0].size[1]).tolist() 129 | 130 | # save annotation 131 | with open(self.out_folder/ f"im_{self.image_counter}.png.json", "w") as f: 132 | json.dump(annotations[i], f) 133 | self.image_counter += 1 134 | 135 | def get_bboxes(self, images, color2word): 136 | annotations = [] 137 | for image_pil in images: 138 | width, height = image_pil.size 139 | image_annotations = {"words": [], "bboxes": []} 140 | image = np.asarray(image_pil) 141 | 142 | thr = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) 143 | thr = cv2.threshold(thr, 254, 255, cv2.THRESH_BINARY_INV)[1] 144 | cnts = cv2.findContours(thr, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0] 145 | 146 | for c in cnts: 147 | peri = cv2.arcLength(c, True) 148 | approx = cv2.approxPolyDP(c, 0.015 * peri, True) 149 | 150 | if len(approx) == 4: 151 | x, y, w, h = cv2.boundingRect(approx) 152 | rgb_color = image_pil.getpixel((x+1, y+1)) 153 | color = '#%02x%02x%02x' % (rgb_color) 154 | if color in color2word: 155 | word = color2word[color] 156 | image_annotations['words'].append(word) 157 | image_annotations["bboxes"].append( 158 | ( 159 | x / width, 160 | y / height, 161 | (x + w) / width, 162 | (y + h) / height) 163 | ) 164 | annotations.append(image_annotations) 165 | return annotations 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DoGe — Synthetic DOcument GEnerator for Document AI 2 | 3 | DoGe is designed to synthesize a dataset of realistic document scans. 
Each document contains meaningful text, headings, 4 | tables, paragraphs with different formatting and fonts which are parsed from Wikipedia. The coordinates 5 | of the words are extracted using the No-OCR method we invented for faster generation on CPU. 6 | 7 | ## Document examples 8 | 9 |
10 | 11 | 12 | 13 | 14 |
15 | 16 | Check the full-size (1024x1024) images in the [resources](./resources) folder. 17 | 18 | ## Usage 19 | 20 | ### Docker installation 21 | 22 | You can use a Docker image with a predefined environment to run DoGe: 23 | ```bash 24 | git clone https://github.com/Travvy88/DocumentGenerator_DoGe 25 | cd DocumentGenerator_DoGe 26 | docker build -t doge . 27 | ``` 28 | 29 | Replace `/full/path/to/output/folder/on/your/computer` in the `docker run` command below with your output folder path. Inside the docker container you can 30 | [start document generation](#start-data-generation). 31 | 32 | ### Ubuntu installation 33 | 34 | For faster generation, it is recommended to install all dependencies without Docker. 35 | DoGe is tested on Ubuntu 22.04. 36 | ```bash 37 | sudo apt-get update 38 | sudo apt-get install libreoffice libjpeg-dev zlib1g-dev poppler-utils 39 | /usr/bin/python3 -m pip install --user unoserver # install unoserver on system python 40 | 41 | git clone https://github.com/Travvy88/DocumentGenerator_DoGe 42 | cd DocumentGenerator_DoGe 43 | # there you can make venv if needed! 44 | pip3 install -r requirements.txt 45 | ``` 46 | 47 | ## Start Data Generation 48 | 49 | Docker: 50 | 51 | ```bash 52 | docker run -v /full/path/to/output/folder/on/your/computer:/app/data doge python3 main.py --out_dir data --image_size 244 --max_urls 4 --num_processes 2 --ports 4000 4001 4002 4003 53 | ``` 54 | 55 | Ubuntu: 56 | 57 | ```bash 58 | python3 main.py --out_dir data --image_size 244 --max_urls 4 --num_processes 2 --ports 4000 4001 4002 4003 59 | ``` 60 | 61 | ### Main.py 62 | 63 | The following arguments can be passed to the script: 64 | 65 | - `--out_dir`: The output directory for saving results. This argument is required. 66 | - `--remove_existing_dir`: If set, the output directory will be deleted before creating a new one. 67 | - `--image_size`: The size of the final images. Default is `244`. 68 | - `--start_page`: The starting page URL. Default is the English Wikipedia main page. You can use the main page URL of another language's Wikipedia. 
69 | - `--languages`: Permitted languages. Pages with other localizations will be ignored. Default is `['en']`. 70 | - `--max_urls`: The maximum number of URLs to process. Default is `16`. 71 | - `--num_processes`: The number of processes to use. Default is `1`. Each process runs one `DocumentGenerator`, and each generator has its own Unoserver instance. 72 | - `--max_threads`: The maximum number of threads inside a process. Default is `3`. 73 | - `--ports`: The list of ports to use. Default is `[8145, 8146]`. The number of ports should be twice `num_processes` (each Unoserver instance needs 2 ports for proper multicore work). 74 | - `--debug`: If set, draws bounding boxes + words on each image and saves intermediate images with highlighted words. 75 | 76 | 77 | ### Docx_config.json 78 | 79 | | Parameter | Description | 80 | |-------------|-------------| 81 | | `max_words` | The maximum number of words allowed in the generated documents. | 82 | | `p_2columns` | The probability that the document will be formatted into two columns. | 83 | | `font_size_interval` | The font size range from which the size is randomly selected for each document. | 84 | | `p_line_spacing` | A list of probabilities controlling the line spacing of the document (1.5 or double). | 85 | | `p_text_alignment` | A list of probabilities controlling the text alignment of the document (center, left, right, justify). | 86 | | `p_heading_bold` | The probability that headings will be displayed in bold font. | 87 | | `heading_relative_size_interval` | The range of relative font sizes for headings. The relative font size is chosen randomly. | 88 | | `p_heading_alignment` | A list of probabilities controlling the alignment of headings (center, left, right, justify). | 89 | | `table_max_rows` | The maximum number of rows allowed in a table. Tables with more than the specified number of rows are dropped. | 90 | | `table_max_cols` | The maximum number of columns allowed in a table. 
Tables with more than the specified number of columns are dropped. | 91 | 92 | Parameters given as probabilities or intervals are sampled randomly for each document. 93 | 94 | In my experience, the generator produces about 14 images per URL on average 95 | with the above Docx settings. 96 | 97 | ### Augmentations 98 | 99 | The augmentation pipeline is applied at the final stage. You can manage different augmentations 100 | in the `src/augmentations.py` file. Read the [Augraphy Docs](https://augraphy.readthedocs.io/en/latest/) for a detailed explanation. 101 | 102 | 103 | ## How it works 104 | ![General scheme](resources/DoGe_Scheme.png "General scheme of DoGe") 105 | 106 | Firstly, the `Manager` class creates the `DocumentGenerator` instances in separate processes. For 107 | each `DocumentGenerator`, a Unoserver instance is started. 108 | 109 | Then, the `UrlParser` generates a list of URLs by crawling the web, starting from a given start page 110 | and following links on each page. It uses `BeautifulSoup` to parse HTML content and extract links, 111 | then checks each link's validity and language, adding it to the list if it meets certain conditions. 112 | The process continues until a maximum number of URLs is reached, and the method returns the list of 113 | generated URLs, excluding the starting URL. 114 | 115 | When data generation begins, the list of URLs is divided into chunks, one per `DocumentGenerator`. 116 | Each `DocumentGenerator` instance retrieves a Wikipedia HTML page by URL from its chunk. 117 | Headings, paragraph formatting, and tables are extracted and placed into a Docx document via the `DocxDocument` class. 118 | At this stage, some random parametrization is applied according to `docx_config.json`. 119 | For example, font size, text alignment, one or two columns, and other parameters 120 | are chosen for each document randomly. 121 | 122 | After that, each word in the Docx is filled with a unique color. 
As a result, a colored rectangle 123 | appears in place of each word. The image will be encoded with 24-bit color depth, 124 | so the maximum number of words per document is 16,777,216. The text of each word is saved to a hashmap of type color_code -> word. 125 | 126 | The next step is Docx to image conversion. DoGe uses Unoserver to convert Docx to Pdf and 127 | pdf2image for image rendering. 128 | 129 | Then, all rectangle coordinates are detected via OpenCV on the converted images. The word for each bounding box is retrieved from the hashmap. 130 | DoGe saves annotations to JSON files in the following format: 131 | 132 | ```json 133 | { 134 | "words": [ 135 | "Hello", 136 | "World" 137 | ], 138 | "bboxes": [ 139 | [0.1, 0.1, 0.03, 0.02], 140 | [0.4, 0.3, 0.11, 0.02] 141 | ] 142 | } 143 | ``` 144 | 145 | The bboxes are normalized and saved in XYWH format. 146 | 147 | The final step is deleting all color fills from words in the Docx document, rendering images, applying Augraphy augmentations, 148 | and saving the augmented images to disk. That's it! 149 | 150 | ## Join us! 151 | DoGe is a promising method of producing synthetic document datasets. Here are some features that would help many developers: 152 | - Download and place images into documents 153 | - Add annotations of headers, tables, paragraphs and images (if added) 154 | - Add different output formats (Parquet for example) 155 | - Add additional information via LLMs 156 | - Performance improvement: the **bottleneck** of generation is transforming Docx -> Pdf -> Png! I am looking for a simpler way of converting Docx to Png. 157 | 158 | If you have any ideas or you want to take part in the development of DoGe, write to me: 159 | - travvy88@yandex.ru 160 | - https://t.me/travvy88 161 | 162 | Or create a Pull Request to this repo. I will be glad to improve the project with the power of the community. 
def profileit(func):
    """Decorate *func* so every call is profiled with cProfile.

    The collected statistics are dumped to ``<func.__name__>.profile`` in the
    current working directory, overwriting the file written by the previous
    call. The file can be inspected with ``pstats``.

    Args:
        func: the callable to profile.

    Returns:
        A wrapper with the same call signature and return value as *func*.
    """
    # Local import: the module does not import functools at file level.
    import functools

    @functools.wraps(func)  # keep __name__/__doc__ of the profiled function
    def wrapper(*args, **kwargs):
        datafn = func.__name__ + ".profile"  # Name the data file sensibly
        prof = cProfile.Profile()
        try:
            return prof.runcall(func, *args, **kwargs)
        finally:
            # Dump even when the call raises: a failing run is often exactly
            # the one that needs to be inspected.
            prof.dump_stats(datafn)

    return wrapper
class DocxDocument:
    """A randomly styled Docx document whose words are tagged with unique colors.

    Every word is written as its own run whose shading and font color are set
    to a color taken from ``self.colors``; ``self.color2word`` maps that color
    back to the word. ``convert_to_uncolored_docx`` later resets the runs to
    black text on a white fill.

    Args:
        docx_config: mapping with the keys read below (``max_words``,
            ``p_2columns``, ``font_size_interval``, ``p_line_spacing``,
            ``p_text_alignment``, ``p_heading_bold``,
            ``heading_relative_size_interval``, ``p_heading_alignment``,
            ``table_max_rows``, ``table_max_cols``).
        uno_client: object exposing ``convert(indata=..., convert_to=...)``;
            presumably an Unoserver client - verify against the caller.
    """

    # Bracketed fragments (e.g. reference markers) are stripped from the text.
    _BRACKETED_RE = re.compile(r'\[.*?\]')
    _WHITESPACE_RE = re.compile(r'\s+')
    # No separating space is inserted before a word starting with one of the
    # first set, nor after a word ending with one of the second set.
    _NO_SPACE_BEFORE = ",.?!:;)}]»"
    _NO_SPACE_AFTER = "«[{("
    _BORDER_NAMES = ('top', 'left', 'bottom', 'right', 'insideH', 'insideV')
    # Font family names discovered on this machine; filled on first use so the
    # system font directories are not rescanned for every document.
    _font_names_cache = None

    def __init__(self, docx_config, uno_client):
        self.docx_config = docx_config
        self.uno_client = uno_client

        self.doc = Document()
        self.colors = self._init_colors(docx_config["max_words"])

        self.color2word = {}
        self.color_ptr = 0

        # Sample the random settings from docx_config. The order of the draws
        # is kept stable so that seeded runs consume the RNG identically.
        if np.random.binomial(1, self.docx_config["p_2columns"]):
            self.num_columns = 2
        else:
            self.num_columns = 1
        self.configure_several_columns()

        # np.random.randint excludes the upper bound of the interval.
        self.font_size = Pt(np.random.randint(*self.docx_config["font_size_interval"]))
        self.font_name = np.random.choice(self._list_available_fonts())

        self.line_spacing = np.random.choice(
            (WD_LINE_SPACING.ONE_POINT_FIVE, WD_LINE_SPACING.DOUBLE),
            p=self._normalize_probabilities(self.docx_config["p_line_spacing"]))

        self.paragraph_alignment = np.random.choice(
            (WD_PARAGRAPH_ALIGNMENT.CENTER, WD_PARAGRAPH_ALIGNMENT.LEFT,
             WD_PARAGRAPH_ALIGNMENT.RIGHT, WD_PARAGRAPH_ALIGNMENT.JUSTIFY),
            p=self._normalize_probabilities(self.docx_config["p_text_alignment"]))

        self.heading_bold = bool(np.random.binomial(1, self.docx_config["p_heading_bold"]))
        self.heading_relative_size = np.random.uniform(*self.docx_config["heading_relative_size_interval"])
        # self.font_size is a Length (EMU); scale its value in points, otherwise
        # Pt() would convert an EMU count as if it were points.
        self.heading_size = Pt(self.heading_relative_size * self.font_size.pt)
        self.heading_alignment = np.random.choice(
            (WD_PARAGRAPH_ALIGNMENT.CENTER, WD_PARAGRAPH_ALIGNMENT.LEFT,
             WD_PARAGRAPH_ALIGNMENT.RIGHT, WD_PARAGRAPH_ALIGNMENT.JUSTIFY),
            p=self._normalize_probabilities(self.docx_config["p_heading_alignment"]))

    def _normalize_probabilities(self, p):
        """Return *p* as a numpy array scaled so that its entries sum to 1."""
        return np.array(p) / sum(p)

    def _list_available_fonts(self):
        """Return the family names of the TTF fonts matplotlib can load.

        The scan runs once per process and the result is cached on the class.
        Names are sorted so the list order (and therefore a seeded
        ``np.random.choice``) does not depend on set iteration order.
        """
        if DocxDocument._font_names_cache is None:
            font_paths = matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf')
            font_names = set()

            for font_path in font_paths:
                try:
                    font = matplotlib.font_manager.get_font(font_path)
                    font_names.add(font.family_name)
                except RuntimeError as e:
                    print(f"Could not load font from path: {font_path}, error: {e}")
            DocxDocument._font_names_cache = sorted(font_names)
        return list(DocxDocument._font_names_cache)

    def _init_colors(self, max_colors):
        """Build the palette of ``'#rrggbb'`` strings used to tag words.

        The walk visits ``side ** 3`` steps with
        ``side = int(max_colors ** (1/3)) + 1``; at step ``(i, j, k)`` the
        running channels are advanced by ``i``, ``j`` and ``k`` modulo 256.
        Because the channels wrap around, the walk can revisit a color, so
        repeats are skipped: two words must never share a ``color2word`` key.
        The order of first occurrences is unchanged.
        """
        side = int(max_colors ** (1/3)) + 1
        r = g = b = 0
        colors = []
        seen = set()
        for i in range(side):
            for j in range(side):
                for k in range(side):
                    r = (r + i) % 256
                    g = (g + j) % 256
                    b = (b + k) % 256
                    hex_color = '#{:02x}{:02x}{:02x}'.format(r, g, b)
                    if hex_color not in seen:
                        seen.add(hex_color)
                        colors.append(hex_color)
        return colors

    def configure_several_columns(self):
        """Write ``self.num_columns`` into the first section's ``w:cols`` element."""
        section = self.doc.sections[0]
        sectPr = section._sectPr
        cols = sectPr.xpath('./w:cols')[0]
        cols.set(qn('w:num'), str(self.num_columns))

    def add_paragraph(self):
        """Append an empty paragraph styled with the sampled settings and return it."""
        paragraph = self.doc.add_paragraph()
        paragraph.alignment = self.paragraph_alignment
        paragraph.paragraph_format.space_after = 0
        paragraph.paragraph_format.line_spacing_rule = self.line_spacing

        return paragraph

    def add_heading(self, element):
        """Add the heading held by HTML *element* followed by a spacer paragraph.

        The level is read from the second character of the tag name
        (``h2`` -> 2). Nothing is added when the paragraph before the last one
        is already a heading (i.e. a heading with no content after it) or when
        the text is exactly ``"Contents"``.
        """
        text = element.text
        level = int(element.name[1])

        if len(self.doc.paragraphs) > 1 and "Heading" in self.doc.paragraphs[-2].style.style_id:
            return

        if text == "Contents":
            return

        paragraph = self.doc.add_heading(level=level)
        _, metadata = self.add_words(text, paragraph)
        self.color2word.update(metadata)
        paragraph.alignment = self.heading_alignment

        # Single-spaced empty paragraph separating the heading from the body.
        p = self.add_paragraph()
        p.paragraph_format.line_spacing_rule = WD_LINE_SPACING.SINGLE

    def add_table(self, html_element):
        """Copy an HTML table into the document, one colored run per word.

        Tables exceeding ``table_max_rows`` / ``table_max_cols`` are dropped,
        and so are tables without any row or cell. Borders are drawn white
        (``FFFFFF``) here; ``convert_to_uncolored_docx`` turns them black.
        """
        parsed_table = [
            [cell.text.strip() for cell in row.find_all(['th', 'td'])]
            for row in html_element.find_all('tr')
        ]

        rows = len(parsed_table)
        # default=0 keeps max() from raising on a table without <tr> elements.
        cols = max((len(row) for row in parsed_table), default=0)
        if rows == 0 or cols == 0:
            return
        if rows > self.docx_config["table_max_rows"] or cols > self.docx_config["table_max_cols"]:
            return

        table = self.doc.add_table(rows=rows, cols=cols)
        table.style = 'TableGrid'
        self.set_table_border_color(table, "FFFFFF")
        # Populating table data
        for i, row_data in enumerate(parsed_table):
            for j, cell_data in enumerate(row_data):
                _, metadata = self.add_words(cell_data, table.cell(i, j).paragraphs[0])
                self.color2word.update(metadata)

    def add_text(self, html_element):
        """Add a paragraph built from the children of HTML *html_element*.

        Each child is written with its tag name as formatting hint; only the
        first child is flagged as the paragraph start.
        """
        paragraph = self.add_paragraph()
        prev_word = " "
        for i, child in enumerate(html_element.children):
            prev_word, metadata = self.add_words(
                child.get_text(), paragraph, formatting=child.name,
                prev_word=prev_word, first_word=(i == 0))
            self.color2word.update(metadata)

    def add_words(self, text, paragraph, formatting=None, prev_word=" ", first_word=False):
        """Append *text* to *paragraph* word by word, coloring every word.

        Args:
            text: raw text; bracketed fragments are removed before splitting
                on whitespace.
            paragraph: python-docx paragraph receiving the runs.
            formatting: ``'b'``, ``'i'`` or ``'u'`` for bold, italic or
                underlined runs; any other value leaves the run plain.
            prev_word: last word written before this call (non-empty); decides
                whether a separating space is needed.
            first_word: when true, the first word written by this call is
                preceded by a 4-space indent instead of a single space.

        Returns:
            ``(prev_word, metadata)`` - the last word written (or the incoming
            ``prev_word`` if none was) and a ``{color: word}`` dict.
        """
        text = self._BRACKETED_RE.sub('', text)
        metadata = {}
        for word in self._WHITESPACE_RE.split(text):
            if not word:
                continue
            if word[0] not in self._NO_SPACE_BEFORE and prev_word[-1] not in self._NO_SPACE_AFTER:
                paragraph.add_run(' ' * 4 if first_word else ' ')
            # The indent belongs to the first word only; without this reset
            # every word of the first chunk was separated by four spaces.
            first_word = False

            run = paragraph.add_run(word)
            color = self.color_word(run)
            metadata[color] = word
            if formatting == 'b':
                run.bold = True
            if formatting == 'i':
                run.italic = True
            if formatting == 'u':
                run.underline = True
            prev_word = word
            run.font.size = self.font_size
            run.font.name = self.font_name
        return prev_word, metadata

    def color_word(self, run):
        """Shade *run* and color its font with the next unused palette color.

        Returns:
            The ``'#rrggbb'`` string that was applied.

        Raises:
            IndexError: when the palette in ``self.colors`` is exhausted.
        """
        color = self.colors[self.color_ptr]
        self.color_ptr += 1

        # Create the shading element and its attributes.
        shd = OxmlElement('w:shd')
        shd.set(qn('w:val'), 'clear')
        shd.set(qn('w:color'), 'auto')
        shd.set(qn('w:fill'), color)

        # A fresh run has no rPr child yet; get_or_add_rPr() creates it so the
        # shading element has a parent to be appended to.
        run._r.get_or_add_rPr().append(shd)

        # Font color equals the fill, so the word renders as a solid rectangle.
        run.font.color.rgb = RGBColor(*tuple(int(color[i:i + 2], 16) for i in (1, 3, 5)))
        return color

    def set_table_border_color(self, table, color):
        """Give *table* single-line borders of hex *color* (e.g. ``"000000"``).

        Any ``w:tblBorders`` element written by a previous call is removed
        first, so recoloring does not leave two conflicting border definitions
        in the table properties.
        """
        tbl_pr = table._element.tblPr
        for stale in tbl_pr.findall(qn('w:tblBorders')):
            tbl_pr.remove(stale)

        tbl_borders = OxmlElement('w:tblBorders')
        for border in self._BORDER_NAMES:
            border_element = OxmlElement(f'w:{border}')
            border_element.set(qn('w:val'), 'single')
            border_element.set(qn('w:sz'), '4')
            border_element.set(qn('w:space'), '0')
            border_element.set(qn('w:color'), color)
            tbl_borders.append(border_element)

        tbl_pr.append(tbl_borders)

    def save_docx(self, path):
        """Save the document to *path*."""
        self.doc.save(path)

    # @profileit
    def get_images(self, image_size, dpi) -> "list[Image.Image]":
        """Render the document to page images.

        The document is saved to memory, passed to
        ``uno_client.convert(..., convert_to='pdf')`` and the result is
        rendered by ``pdf2image.convert_from_bytes`` with *dpi* and
        ``size=image_size``.
        """
        out = io.BytesIO()
        self.doc.save(out)
        doc_bytes = out.getvalue()
        pdf_bytes = self.uno_client.convert(indata=doc_bytes, convert_to='pdf')
        return convert_from_bytes(pdf_bytes, dpi=dpi, size=image_size)

    @staticmethod
    def _reset_run_colors(run):
        """Set *run* to black text and, if it is shaded, a white fill."""
        run.font.color.rgb = RGBColor(0, 0, 0)
        rpr = run.element.get_or_add_rPr()
        element = rpr.find(qn('w:shd'))
        if element is not None:
            element.set(qn('w:fill'), "#FFFFFF")

    def convert_to_uncolored_docx(self):
        """Turn the color-tagged document into a normal black-on-white one.

        Resets every run of the body paragraphs and of the first paragraph of
        each table cell (the only one ``add_table`` writes to), and recolors
        table borders to black.
        """
        for paragraph in self.doc.paragraphs:
            for run in paragraph.runs:
                self._reset_run_colors(run)

        for table in self.doc.tables:
            self.set_table_border_color(table, "000000")
            for row in table.rows:
                for cell in row.cells:
                    for run in cell.paragraphs[0].runs:
                        self._reset_run_colors(run)

    def get_num_words(self):
        """Return the number of distinct colored words recorded so far."""
        return len(self.color2word)
def get_augmentation_phases():
    """Build the augmentation lists for each pipeline phase.

    Returns:
        A dict with the keys ``'ink_phase'``, ``'paper_phase'``,
        ``'post_phase'`` and ``'pre_phase'``, each holding a list of
        augmentation objects (``pre_phase`` is currently empty).

    Every ``random.choice`` / ``random.uniform`` / ``random.randint`` below is
    evaluated once, while the lists are being built, so those arguments are
    fixed for the objects returned by one call and re-drawn on the next call.
    """
    # NOTE(review): the classes come from `from augraphy import *`; what `p`
    # means on an augmentation nested inside OneOf (e.g. DirtyDrum,
    # ReflectedLight, Geometric below) should be confirmed against the
    # Augraphy documentation.

    pre_phase = [
        # Rescale(scale="optimal", target_dpi = 300, p = 1.0),
    ]

    ink_phase = [
        InkColorSwap(
            ink_swap_color="random",
            ink_swap_sequence_number_range=(5, 10),
            ink_swap_min_width_range=(2, 3),
            ink_swap_max_width_range=(100, 120),
            ink_swap_min_height_range=(2, 3),
            ink_swap_max_height_range=(100, 120),
            ink_swap_min_area_range=(10, 20),
            ink_swap_max_area_range=(400, 500),
            p=0.1,
        ),
        LinesDegradation(
            line_roi=(0.0, 0.0, 1.0, 1.0),
            line_gradient_range=(32, 255),
            line_gradient_direction=(0, 2),
            line_split_probability=(0.2, 0.4),
            line_replacement_value=(250, 255),
            line_min_length=(30, 40),
            line_long_to_short_ratio=(5, 7),
            line_replacement_probability=(0.4, 0.5),
            line_replacement_thickness=(1, 3),
            p=0.1,
        ),
        OneOf(
            [
                Dithering(
                    dither=random.choice(["ordered", "floyd-steinberg"]),
                    order=(3, 5),
                ),
                InkBleed(
                    intensity_range=(0.1, 0.2),
                    kernel_size=random.choice([(7, 7), (5, 5), (3, 3)]),
                    severity=(0.4, 0.6),
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                InkShifter(
                    text_shift_scale_range=(18, 27),
                    text_shift_factor_range=(1, 4),
                    text_fade_range=(0, 2),
                    blur_kernel_size=(5, 5),
                    blur_sigma=0,
                    noise_type="random",
                ),
                BleedThrough(
                    intensity_range=(0.1, 0.3),
                    color_range=(32, 224),
                    ksize=(17, 17),
                    sigmaX=1,
                    alpha=random.uniform(0.1, 0.2),
                    offsets=(10, 20),
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                Hollow(
                    hollow_median_kernel_value_range=(71, 101),
                    hollow_min_width_range=(1, 2),
                    hollow_max_width_range=(150, 200),
                    hollow_min_height_range=(1, 2),
                    hollow_max_height_range=(150, 200),
                    hollow_min_area_range=(10, 20),
                    hollow_max_area_range=(2000, 5000),
                    hollow_dilation_kernel_size_range=(1, 2),
                ),
                Letterpress(
                    n_samples=(100, 400),
                    n_clusters=(200, 400),
                    std_range=(500, 3000),
                    value_range=(150, 224),
                    value_threshold_range=(96, 128),
                    blur=1,
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                LowInkRandomLines(
                    count_range=(5, 10),
                    use_consistent_lines=random.choice([True, False]),
                    noise_probability=0.1,
                ),
                LowInkPeriodicLines(
                    count_range=(2, 5),
                    period_range=(16, 32),
                    use_consistent_lines=random.choice([True, False]),
                    noise_probability=0.1,
                ),
            ],
            p=0.1,
        ),
    ]

    paper_phase = [
        PaperFactory(p=0.1),
        ColorPaper(
            hue_range=(0, 255),
            saturation_range=(10, 40),
            p=0.1,
        ),
        OneOf(
            [
                # Disabled alternative, kept for reference:
                #DelaunayTessellation(
                #    n_points_range=(500, 800),
                #    n_horizontal_points_range=(500, 800),
                #    n_vertical_points_range=(500, 800),
                #    noise_type="random",
                #    color_list="default",
                #    color_list_alternate="default",
                #),
                PatternGenerator(
                    imgx=random.randint(256, 512),
                    imgy=random.randint(256, 512),
                    n_rotation_range=(10, 15),
                    color="random",
                    alpha_range=(0.25, 0.5),
                ),
                # Disabled alternative, kept for reference:
                #VoronoiTessellation(
                #    mult_range=(50, 80),
                #    seed=19829813472,
                #    num_cells_range=(500, 1000),
                #    noise_type="random",
                #    background_value=(200, 255),
                #),
            ],
            p=0.1,
        ),
        WaterMark(
            watermark_word="random",
            watermark_font_size=(10, 15),
            watermark_font_thickness=(20, 25),
            watermark_rotation=(0, 360),
            watermark_location="random",
            watermark_color="random",
            watermark_method="darken",
            p=0.1,
        ),
        # The two sequences below contain the same two texturizers in
        # opposite order.
        OneOf(
            [
                AugmentationSequence(
                    [
                        NoiseTexturize(
                            sigma_range=(3, 10),
                            turbulence_range=(2, 5),
                            texture_width_range=(300, 500),
                            texture_height_range=(300, 500),
                        ),
                        BrightnessTexturize(
                            texturize_range=(0.9, 0.99),
                            deviation=0.03,
                        ),
                    ],
                ),
                AugmentationSequence(
                    [
                        BrightnessTexturize(
                            texturize_range=(0.9, 0.99),
                            deviation=0.03,
                        ),
                        NoiseTexturize(
                            sigma_range=(3, 10),
                            turbulence_range=(2, 5),
                            texture_width_range=(300, 500),
                            texture_height_range=(300, 500),
                        ),
                    ],
                ),
            ],
            p=0.1,
        ),
    ]

    post_phase = [
        ColorShift(
            color_shift_offset_x_range=(3, 5),
            color_shift_offset_y_range=(3, 5),
            color_shift_iterations=(2, 3),
            color_shift_brightness_range=(0.9, 1.1),
            color_shift_gaussian_kernel_range=(3, 3),
            p=0.1,
        ),
        OneOf(
            [
                DirtyDrum(
                    line_width_range=(1, 6),
                    line_concentration=random.uniform(0.05, 0.15),
                    direction=random.randint(0, 2),
                    noise_intensity=random.uniform(0.6, 0.95),
                    noise_value=(64, 224),
                    ksize=random.choice([(3, 3), (5, 5), (7, 7)]),
                    sigmaX=0,
                    p=0.1,
                ),
                DirtyRollers(
                    line_width_range=(2, 32),
                    scanline_type=0,
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                LightingGradient(
                    light_position=None,
                    direction=None,
                    max_brightness=255,
                    min_brightness=0,
                    mode="gaussian",
                    linear_decay_rate=None,
                    transparency=None,
                ),
                Brightness(
                    brightness_range=(0.9, 1.1),
                    min_brightness=0,
                    min_brightness_value=(120, 150),
                ),
                Gamma(
                    gamma_range=(0.9, 1.1),
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                SubtleNoise(
                    subtle_range=random.randint(5, 10),
                ),
                Jpeg(
                    quality_range=(25, 95),
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                Markup(
                    num_lines_range=(2, 7),
                    markup_length_range=(0.5, 1),
                    markup_thickness_range=(1, 2),
                    markup_type=random.choice(["strikethrough", "crossed", "highlight", "underline"]),
                    markup_color="random",
                    single_word_mode=False,
                    repetitions=1,
                ),
                Scribbles(
                    scribbles_type="random",
                    scribbles_location="random",
                    scribbles_size_range=(250, 600),
                    scribbles_count_range=(1, 6),
                    scribbles_thickness_range=(1, 3),
                    scribbles_brightness_change=[32, 64, 128],
                    scribbles_text="random",
                    scribbles_text_font="random",
                    scribbles_text_rotate_range=(0, 360),
                    scribbles_lines_stroke_count_range=(1, 6),
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                # Disabled alternative, kept for reference:
                #BadPhotoCopy(
                #    noise_mask=None,
                #    noise_type=-1,
                #    noise_side="random",
                #    noise_iteration=(1, 2),
                #    noise_size=(1, 3),
                #    noise_value=(128, 196),
                #    noise_sparsity=(0.3, 0.6),
                #    noise_concentration=(0.1, 0.6),
                #    blur_noise=random.choice([True, False]),
                #    blur_noise_kernel=random.choice([(3, 3), (5, 5), (7, 7)]),
                #    wave_pattern=random.choice([True, False]),
                #    edge_effect=random.choice([True, False]),
                #),
                ShadowCast(
                    shadow_side="random",
                    shadow_vertices_range=(1, 20),
                    shadow_width_range=(0.3, 0.8),
                    shadow_height_range=(0.3, 0.8),
                    shadow_color=(0, 0, 0),
                    shadow_opacity_range=(0.2, 0.9),
                    shadow_iterations_range=(1, 2),
                    shadow_blur_kernel_range=(101, 301),
                ),
                LowLightNoise(
                    num_photons_range=(50, 100),
                    alpha_range=(0.7, 1.0),
                    beta_range=(10, 30),
                    gamma_range=(1, 1.8),
                    bias_range=(20, 40),
                    dark_current_value=1.0,
                    exposure_time=0.2,
                    gain=0.1,
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                NoisyLines(
                    noisy_lines_direction="random",
                    noisy_lines_location="random",
                    noisy_lines_number_range=(5, 20),
                    noisy_lines_color=(0, 0, 0),
                    noisy_lines_thickness_range=(1, 2),
                    noisy_lines_random_noise_intensity_range=(0.01, 0.1),
                    noisy_lines_length_interval_range=(0, 100),
                    noisy_lines_gaussian_kernel_value_range=(3, 5),
                    noisy_lines_overlay_method="ink_to_paper",
                ),
                BindingsAndFasteners(
                    overlay_types="darken",
                    foreground=None,
                    effect_type="random",
                    width_range="random",
                    height_range="random",
                    angle_range=(-30, 30),
                    ntimes=(2, 6),
                    nscales=(0.9, 1.0),
                    edge="random",
                    edge_offset=(10, 50),
                    use_figshare_library=0,
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                Squish(
                    squish_direction="random",
                    squish_location="random",
                    squish_number_range=(5, 10),
                    squish_distance_range=(5, 7),
                    squish_line="random",
                    squish_line_thickness_range=(1, 1),
                ),
                Geometric(
                    fliplr=False,
                    flipud=False,
                    crop=(),
                    rotate_range=(-4, 4),
                    randomize=0,
                    p=1,
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                DotMatrix(
                    dot_matrix_shape="random",
                    dot_matrix_dot_width_range=(3, 3),
                    dot_matrix_dot_height_range=(3, 3),
                    dot_matrix_min_width_range=(1, 2),
                    dot_matrix_max_width_range=(150, 200),
                    dot_matrix_min_height_range=(1, 2),
                    dot_matrix_max_height_range=(150, 200),
                    dot_matrix_min_area_range=(10, 20),
                    dot_matrix_max_area_range=(2000, 5000),
                    dot_matrix_median_kernel_value_range=(128, 255),
                    dot_matrix_gaussian_kernel_value_range=(1, 3),
                    dot_matrix_rotate_value_range=(0, 360),
                ),
                Faxify(
                    scale_range=(0.3, 0.6),
                    monochrome=random.choice([0, 1]),
                    monochrome_method="random",
                    monochrome_arguments={},
                    halftone=random.choice([0, 1]),
                    invert=1,
                    half_kernel_size=random.choice([(1, 1), (2, 2)]),
                    angle=(0, 360),
                    sigma=(1, 3),
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                InkMottling(
                    ink_mottling_alpha_range=(0.2, 0.3),
                    ink_mottling_noise_scale_range=(2, 2),
                    ink_mottling_gaussian_kernel_range=(3, 5),
                ),
                ReflectedLight(
                    reflected_light_smoothness=0.8,
                    reflected_light_internal_radius_range=(0.0, 0.001),
                    reflected_light_external_radius_range=(0.5, 0.8),
                    reflected_light_minor_major_ratio_range=(0.9, 1.0),
                    reflected_light_color=(255, 255, 255),
                    reflected_light_internal_max_brightness_range=(0.75, 0.75),
                    reflected_light_external_max_brightness_range=(0.5, 0.75),
                    reflected_light_location="random",
                    reflected_light_ellipse_angle_range=(0, 360),
                    reflected_light_gaussian_kernel_size_range=(5, 310),
                    p=0.1,
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                PageBorder(
                    page_border_width_height="random",
                    page_border_color=(0, 0, 0),
                    page_border_background_color=(0, 0, 0),
                    page_numbers="random",
                    page_rotation_angle_range=(-3, 3),
                    curve_frequency=(2, 8),
                    curve_height=(2, 4),
                    curve_length_one_side=(50, 100),
                    same_page_border=random.choice([0, 1]),
                ),
                # Disabled alternative, kept for reference:
                #BookBinding(
                #    shadow_radius_range=(30, 100),
                #    curve_range_right=(50, 200),
                #    curve_range_left=(50, 200),
                #    curve_ratio_right=(0.1, 0.3),
                #    curve_ratio_left=(0.1, 0.3),
                #    mirror_range=(1.0, 1.0),
                #    binding_align="random",
                #    binding_pages=(5, 10),
                #    curling_direction=-1,
                #    backdrop_color=(0, 0, 0),
                #    enable_shadow=random.choice([0, 1]),
                #),
                Folding(
                    fold_x=None,
                    fold_deviation=(0, 0),
                    fold_count=random.randint(2, 8),
                    fold_noise=0.01,
                    fold_angle_range=(-360, 360),
                    gradient_width=(0.1, 0.2),
                    gradient_height=(0.01, 0.02),
                    backdrop_color=(0, 0, 0),
                ),
            ],
            p=0.1,
        ),
        # Rescale(scale = "original" , p = 1.0)
    ]
    return {'ink_phase': ink_phase,
            'paper_phase': paper_phase,
            'post_phase': post_phase,
            'pre_phase': pre_phase}