├── .gitignore
├── resources
├── im_2.png
├── im_9.png
├── im_10.png
├── im_12.png
└── DoGe_Scheme.png
├── scripts
├── print_profiling_stats.py
├── count_files.bash
└── show_image_annotations.py
├── requirements.txt
├── docx_config.json
├── Dockerfile
├── src
├── url_parser.py
├── utils.py
├── manager.py
├── document_generator.py
├── docx_document.py
└── augmentations.py
├── main.py
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | data
2 | *cache*
3 | nohup.out
4 | *.png
5 | .vscode
6 | *.profile
7 | logs
8 | fonts
--------------------------------------------------------------------------------
/resources/im_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Travvy88/DocumentGenerator_DoGe/HEAD/resources/im_2.png
--------------------------------------------------------------------------------
/resources/im_9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Travvy88/DocumentGenerator_DoGe/HEAD/resources/im_9.png
--------------------------------------------------------------------------------
/resources/im_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Travvy88/DocumentGenerator_DoGe/HEAD/resources/im_10.png
--------------------------------------------------------------------------------
/resources/im_12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Travvy88/DocumentGenerator_DoGe/HEAD/resources/im_12.png
--------------------------------------------------------------------------------
/resources/DoGe_Scheme.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Travvy88/DocumentGenerator_DoGe/HEAD/resources/DoGe_Scheme.png
--------------------------------------------------------------------------------
/scripts/print_profiling_stats.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import pstats
3 |
def parse_args(argv=None):
    """Parse the command line of the profiling-stats viewer.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Namespace with ``path`` (the profile dump to read) and ``n`` (the
        maximum number of rows to print, or ``None`` to print all rows).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('path')
    # type=int lets argparse reject non-numeric values with a usage error;
    # the None default avoids calling int(None) when -n is omitted.
    parser.add_argument('-n', type=int, default=None)
    return parser.parse_args(argv)


def main(argv=None):
    """Print the profile entries sorted by cumulative time."""
    args = parse_args(argv)
    stats = pstats.Stats(args.path)
    stats.strip_dirs().sort_stats(pstats.SortKey.CUMULATIVE)
    if args.n is None:
        stats.print_stats()
    else:
        stats.print_stats(args.n)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/scripts/count_files.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
# Print the number of regular files found anywhere below a directory.
#   $1 - directory to scan
# Writes a message to stderr and returns 1 when the argument is missing
# or is not a directory.
count_files() {
    local target="$1"

    [ -n "$target" ] || { echo "No argument provided" >&2; return 1; }
    [ -d "$target" ] || { echo "Argument is not a directory" >&2; return 1; }

    find "$target" -type f | wc -l
}
14 |
15 | count_files "$@"
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | -e git+https://github.com/Travvy88/augraphy.git@fix_bboxes_oneof_augmentation_sequence#egg=augraphy
2 | opencv-python==4.10.0.84
3 | python-docx==1.1.2
4 | matplotlib==3.8.2
5 | pdf2image==1.17.0
6 | tqdm==4.66.5
7 | unoserver==2.1
8 | unotools==0.3.3
9 | beautifulsoup4==4.12.3
10 | requests==2.32.3
11 | pillow-simd==9.5.0.post2
--------------------------------------------------------------------------------
/docx_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "max_words": 20000,
3 | "p_2columns": 0.2,
4 | "font_size_interval": [8, 13],
5 | "p_line_spacing": [0.5, 0.5],
6 | "p_text_alignment": [0.1, 0.4, 0, 0.5],
7 | "p_heading_bold": 0.5,
8 | "heading_relative_size_interval": [1, 2],
9 | "p_heading_alignment": [0.5, 0.25, 0.01, 0.24],
10 | "table_max_rows": 15,
11 | "table_max_cols": 5
12 | }
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM ubuntu:22.04

# Install LibreOffice, Python with pip, git (requirements.txt contains a
# git+https dependency), the JPEG/zlib development headers and poppler-utils
# (presumably needed by pillow-simd and pdf2image respectively).
# DEBIAN_FRONTEND=noninteractive keeps package configuration from waiting for
# input during the build; removing the apt lists keeps the layer smaller.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        libreoffice python3 python3-pip git libjpeg-dev zlib1g-dev poppler-utils \
    && rm -rf /var/lib/apt/lists/*

# Install unoserver into the system Python that is used by LibreOffice.
# The version is pinned to the one listed in requirements.txt so the server
# and the client library stay in sync.
RUN /usr/bin/python3 -m pip install --no-cache-dir --user unoserver==2.1

# Set working directory to /app
WORKDIR /app

# Copy the requirements file on its own so the dependency layer is cached
# independently of source changes
COPY requirements.txt .

# Install Python dependencies
RUN pip3 install --no-cache-dir -r requirements.txt

# Copy the application sources and configuration into the container at /app
COPY src /app/src
COPY main.py /app/main.py
COPY docx_config.json /app/docx_config.json

# Run interactively; generation is started manually with `python3 main.py ...`
CMD ["/bin/bash"]
--------------------------------------------------------------------------------
/scripts/show_image_annotations.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | from PIL import Image, ImageDraw, ImageFont
4 |
def bbox_to_pixels(bbox, width, height):
    """Scale one normalized (x, y, w, h) box to pixel corner coordinates.

    Args:
        bbox: Sequence ``(x, y, w, h)`` with values relative to the image size.
        width: Image width in pixels (scales the horizontal values).
        height: Image height in pixels (scales the vertical values).

    Returns:
        Tuple ``(x1, y1, x2, y2)`` of integer pixel coordinates.
    """
    x, y, w, h = bbox
    return (
        int(x * width),
        int(y * height),
        int((x + w) * width),
        int((y + h) * height),
    )


def main():
    """Draw every annotated word box of an image and save ``show_anno.png``.

    Reads the image given on the command line and the annotation file stored
    next to it as ``<path>.json`` (keys ``words`` and ``bboxes``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('path')
    args = parser.parse_args()

    image = Image.open(args.path)
    # PIL reports the size as (width, height); the previous unpacking order
    # swapped the two and distorted boxes on non-square images.
    width, height = image.size

    with open(args.path + '.json', 'r') as f:
        annotations = json.load(f)

    draw = ImageDraw.Draw(image)
    font = ImageFont.truetype('arial.ttf', size=8)
    for word, bbox in zip(annotations['words'], annotations['bboxes']):
        x, y, w, h = bbox

        # Make negative extents positive before drawing.
        # NOTE(review): this moves the origin by -w / -h (away from the box)
        # rather than onto the opposite corner (x + w, y + h) - confirm that
        # this is the intended handling of flipped boxes.
        if w < 0:
            x = x - w
            w = w * -1
        if h < 0:
            y = y - h
            h = h * -1

        x1, y1, x2, y2 = bbox_to_pixels((x, y, w, h), width, height)

        print([x, y, w, h])
        print([x1, y1, x2, y2])

        draw.rectangle([x1, y1, x2, y2], outline="blue", width=1)
        draw.text((x1, y1 - 10), word, fill="red", font=font)  # label just above the box

    output_image_path = 'show_anno.png'
    image.save(output_image_path)


if __name__ == "__main__":
    main()
44 |
--------------------------------------------------------------------------------
/src/url_parser.py:
--------------------------------------------------------------------------------
1 | from urllib.parse import urljoin, urlparse
2 | from bs4 import BeautifulSoup
3 | import requests
4 | from tqdm import tqdm
5 |
6 |
class UrlParser:
    """Breadth-first crawler that collects Wikipedia article URLs."""

    def parse(self, start_url, max_urls, languages):
        """Crawl from ``start_url`` until ``max_urls`` URLs are known.

        Args:
            start_url: Page the crawl starts from.
            max_urls: Upper bound on the number of known URLs, counting
                ``start_url`` itself.
            languages: Language codes a link's host must contain.

        Returns:
            List of discovered URLs without ``start_url``. The list is
            shorter than ``max_urls - 1`` when the crawl runs out of pages
            to visit. Pages that cannot be fetched are reported and skipped
            instead of aborting the whole crawl.
        """
        urls = [start_url]
        seen = {start_url}  # O(1) duplicate check; `urls` keeps crawl order
        ptr = 0

        pbar = tqdm(initial=1, total=max_urls)
        try:
            # `ptr < len(urls)` stops the crawl when every known page has
            # been visited but the quota is still not reached.
            while len(urls) < max_urls and ptr < len(urls):
                url = urls[ptr]
                ptr += 1
                try:
                    response = requests.get(url)
                    response.raise_for_status()
                except requests.exceptions.RequestException as e:
                    print(f"Failed to retrieve {url}: {e}")
                    continue

                # Parse the page content and walk over all links on the page
                soup = BeautifulSoup(response.content, 'html.parser')
                for link in soup.find_all('a', href=True):
                    if len(urls) >= max_urls:
                        break
                    full_url = urljoin(url, link['href'])
                    if full_url in seen or not self.is_valid_url(full_url, languages):
                        continue
                    seen.add(full_url)
                    urls.append(full_url)
                    pbar.update(1)
        finally:
            pbar.close()
        return urls[1:]

    def is_valid_url(self, url, languages):
        """Return True if ``url`` looks like a Wikipedia article in a permitted language.

        A URL is accepted when its scheme is http(s), its host contains
        ``wikipedia.org`` and at least one entry of ``languages``, and its
        path starts with ``/wiki/`` without containing ``:`` or
        ``/wiki/Main_Page``.
        """
        parsed = urlparse(url)
        if parsed.scheme not in ('http', 'https'):
            return False

        netloc = parsed.netloc
        if 'wikipedia.org' not in netloc:
            return False
        if not any(lang_element in netloc for lang_element in languages):
            return False

        path = parsed.path
        if not path.startswith('/wiki/'):
            return False
        return not any(sub in path for sub in [':', '/wiki/Main_Page'])
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import cProfile
3 | import json
4 | from pathlib import Path
5 | from src.manager import Manager
6 |
7 |
def create_parser():
    """Build the command-line parser for the document generator.

    Returns:
        argparse.ArgumentParser: Parser exposing the output directory, crawl
        settings, image size and the process/thread/port configuration.
    """
    parser = argparse.ArgumentParser(description="Manager Configuration")

    parser.add_argument('--out_dir', type=str, required=True,
                        help='Output directory for saving results')
    parser.add_argument('--remove_existing_dir', action='store_true',
                        help='If out_dir exists, delete the folder and files before creating a new one')
    parser.add_argument('--debug', action='store_true',
                        help='Enable debug mode')
    # %(default)s keeps each help text in sync with the actual default value.
    parser.add_argument('--image_size', type=int, default=244,
                        help='Size of the final images (default: %(default)s)')
    parser.add_argument('--start_page', type=str, default='https://en.wikipedia.org/wiki/Main_Page',
                        help='Starting page URL (default: Wikipedia main page)')
    parser.add_argument('--languages', type=str, nargs='+', default=['en'],
                        help='Permitted languages. Other languages will be ignored (default: English)')
    parser.add_argument('--max_urls', type=int, default=16,
                        help='Maximum number of URLs to process (default: %(default)s)')
    parser.add_argument('--num_processes', type=int, default=1,
                        help='Number of processes to use (default: %(default)s)')
    parser.add_argument('--max_threads', type=int, default=3,
                        help='Maximum threads inside a process (default: %(default)s)')
    # Ports are parsed as integers so values given on the command line have
    # the same type as the integer defaults.
    parser.add_argument('--ports', type=int, nargs='+', default=[8145, 8146],
                        help='List of ports to use (default: %(default)s). Number of ports '
                             'should be 2 times larger than num_processes')

    return parser
34 |
if __name__ == "__main__":
    cli_args = create_parser().parse_args()

    # The document layout settings are read from the current working directory.
    with open('docx_config.json', 'r') as config_file:
        docx_config = json.load(config_file)

    # Forward the command-line options to the Manager; list-valued options
    # are passed as tuples.
    manager_kwargs = dict(
        docx_config=docx_config,
        out_dir=Path(cli_args.out_dir),
        remove_existing_dir=cli_args.remove_existing_dir,
        debug=cli_args.debug,
        image_size=cli_args.image_size,
        start_page=cli_args.start_page,
        languages=tuple(cli_args.languages),
        max_urls=cli_args.max_urls,
        num_processes=cli_args.num_processes,
        max_threads=cli_args.max_threads,
        ports=tuple(cli_args.ports),
    )
    Manager(**manager_kwargs).generate()
56 |
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | from PIL import Image, ImageDraw
2 | import cv2
3 | import numpy as np
4 |
5 |
def convert_xywh_to_x1y1x2y2(bboxes):
    """Convert boxes from (x, y, w, h) to corner format (x1, y1, x2, y2).

    Accepts a list of indexable boxes (a new list of lists is returned) or a
    2-D ndarray with one box per row (a new ndarray is returned). Any other
    input type yields ``None``.
    """
    if isinstance(bboxes, np.ndarray):
        left = bboxes[:, 0]
        top = bboxes[:, 1]
        right = bboxes[:, 0] + bboxes[:, 2]
        bottom = bboxes[:, 1] + bboxes[:, 3]
        return np.column_stack((left, top, right, bottom))

    if isinstance(bboxes, list):
        converted = []
        for box in bboxes:
            converted.append([box[0], box[1], box[0] + box[2], box[1] + box[3]])
        return converted
15 |
16 |
def convert_x1y1x2y2_to_xywh(bboxes):
    """Convert boxes from corner format (x1, y1, x2, y2) to (x, y, w, h).

    Accepts a list of indexable boxes (a new list of lists is returned) or a
    2-D ndarray with one box per row (a new ndarray is returned). Any other
    input type yields ``None``.
    """
    if isinstance(bboxes, np.ndarray):
        left = bboxes[:, 0]
        top = bboxes[:, 1]
        box_width = bboxes[:, 2] - bboxes[:, 0]
        box_height = bboxes[:, 3] - bboxes[:, 1]
        return np.column_stack((left, top, box_width, box_height))

    if isinstance(bboxes, list):
        converted = []
        for box in bboxes:
            converted.append([box[0], box[1], box[2] - box[0], box[3] - box[1]])
        return converted
26 |
27 |
def normalize_bboxes(bboxes, width, height):
    """Scale pixel-valued boxes into coordinates relative to the image size.

    Elements 0 and 2 of every box are divided by ``width`` and elements 1
    and 3 by ``height``.

    Args:
        bboxes: List of indexable 4-element boxes, or an ndarray with one box
            per row. An empty ndarray of any shape is accepted.
        width: Image width used to scale the horizontal values.
        height: Image height used to scale the vertical values.

    Returns:
        A list of lists for list input, an ``(N, 4)`` ndarray for ndarray
        input, or ``None`` for any other input type.
    """
    if isinstance(bboxes, list):
        return [[bbox[0] / width, bbox[1] / height, bbox[2] / width, bbox[3] / height] for bbox in bboxes]
    if isinstance(bboxes, np.ndarray):
        # np.array([]) has shape (0,), so column indexing would raise
        # IndexError; a page without boxes maps to an empty (0, 4) result.
        if bboxes.size == 0:
            return np.empty((0, 4), dtype=float)
        el1 = bboxes[:, 0] / width
        el2 = bboxes[:, 1] / height
        el3 = bboxes[:, 2] / width
        el4 = bboxes[:, 3] / height
        return np.column_stack((el1, el2, el3, el4))
37 |
38 |
def unnormalize_bboxes(bboxes, width, height):
    """Scale relative box coordinates to pixel values for a given image size.

    Elements 0 and 2 of every box are multiplied by ``width`` and elements 1
    and 3 by ``height``.

    Args:
        bboxes: List of indexable 4-element boxes, or an ndarray with one box
            per row. An empty ndarray of any shape is accepted.
        width: Image width used to scale the horizontal values.
        height: Image height used to scale the vertical values.

    Returns:
        A list of lists for list input, an ``(N, 4)`` ndarray for ndarray
        input, or ``None`` for any other input type.
    """
    if isinstance(bboxes, list):
        return [[bbox[0] * width, bbox[1] * height, bbox[2] * width, bbox[3] * height] for bbox in bboxes]
    if isinstance(bboxes, np.ndarray):
        # np.array([]) has shape (0,), so column indexing would raise
        # IndexError; a page without boxes maps to an empty (0, 4) result.
        if bboxes.size == 0:
            return np.empty((0, 4), dtype=float)
        el1 = bboxes[:, 0] * width
        el2 = bboxes[:, 1] * height
        el3 = bboxes[:, 2] * width
        el4 = bboxes[:, 3] * height
        return np.column_stack((el1, el2, el3, el4))
48 |
def draw_bboxes_pil(image, bboxes, words=None):
    """Draw red boxes and optional word labels onto a PIL image in place.

    Args:
        image: PIL image to draw on; it is modified and also returned.
        bboxes: Iterable of boxes in (x1, y1, x2, y2) pixel coordinates.
        words: Optional labels, one per box. When omitted, the boxes are
            drawn with empty labels.

    Returns:
        The same image object that was passed in.
    """
    if words is None:
        # The signature advertises words as optional, but zip() cannot
        # iterate over None; fall back to empty labels.
        words = [""] * len(bboxes)

    draw = ImageDraw.Draw(image)
    for bbox, word in zip(bboxes, words):
        x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
        # Draw rectangle with red color and 2px thickness
        draw.rectangle([(x1, y1), (x2, y2)], outline="red", width=2)
        # Put the label slightly above the top-left corner of the box
        draw.text((x1, y1 - 15), word, fill="red")
    return image
58 |
def draw_bboxes(image, bboxes, words=None):
    """Draw (x1, y1, x2, y2) boxes on a numpy or PIL image.

    The result has the same kind as the input: an ndarray input is converted
    to PIL for drawing and converted back, a PIL image is drawn on directly.
    Missing ``words`` are replaced by empty labels.

    Raises:
        ValueError: If ``image`` is neither an ndarray nor a PIL image.
    """
    labels = [""] * len(bboxes) if words is None else words

    if isinstance(image, Image.Image):
        return draw_bboxes_pil(image, bboxes, labels)

    if isinstance(image, np.ndarray):
        pil_image = Image.fromarray(image)
        return np.array(draw_bboxes_pil(pil_image, bboxes, labels))

    raise ValueError(f"Unsupported image type: {type(image)}")
74 |
--------------------------------------------------------------------------------
/src/manager.py:
--------------------------------------------------------------------------------
1 | import json
2 | import multiprocessing
3 | import os
4 | from pathlib import Path
5 | import shutil
6 | import time
7 |
8 | from tqdm import tqdm
9 |
10 | from src.document_generator import DocumentGenerator
11 | from src.url_parser import UrlParser
12 |
13 |
class Manager:
    """Coordinate URL crawling, parallel document generation and result merging.

    One ``DocumentGenerator`` is created per process. Every generator writes
    into its own ``tmp_process_<i>`` folder below ``out_dir``; after all
    processes finish, the validated results are moved into ``out_dir`` under
    consecutive ``image_<n>`` names and the temporary folders are removed.
    """

    def __init__(self,
                 docx_config: dict,
                 out_dir: Path,
                 remove_existing_dir,
                 debug,
                 image_size,
                 start_page,
                 languages,
                 max_urls,
                 num_processes,
                 max_threads,
                 ports):
        """Store the configuration, create the folders and the generators.

        Raises:
            ValueError: If fewer than ``2 * num_processes`` ports are given.
        """
        # Generator i uses ports[i] and ports[num_processes + i]. Checking
        # the count first avoids an IndexError after folders were already
        # touched and some generators were already constructed.
        if len(ports) < 2 * num_processes:
            raise ValueError(
                f"Expected at least {2 * num_processes} ports for "
                f"{num_processes} processes, got {len(ports)}"
            )

        self.docx_config = docx_config
        self.out_dir = out_dir
        self.debug = debug
        self.image_size = image_size
        self.start_page = start_page
        self.languages = languages
        self.max_urls = max_urls

        self.num_processes = num_processes
        self.max_threads = max_threads
        self.ports = ports

        self.url_parser = UrlParser()
        self.folders = self._create_folders(remove_existing_dir=remove_existing_dir)
        self.doc_generators = [
            DocumentGenerator(
                self.max_threads,
                self.image_size,
                self.docx_config,
                self.folders[i],
                ports[i],
                ports[num_processes + i],
                self.debug,
            )
            for i in range(num_processes)
        ]

    def generate(self):
        """Crawl URLs, run all generators in parallel, merge and print statistics."""
        start_time = time.time()
        print('Parsing urls...')
        urls = self.url_parser.parse(self.start_page, self.max_urls, self.languages)
        if urls is None:
            # The parser may return None when a page could not be fetched;
            # continue with an empty work list instead of crashing on len(None).
            urls = []
        urls_chunks = self._split_urls_to_chunks(urls)

        processes = []
        for i in range(self.num_processes):
            process = multiprocessing.Process(name=f"Generator_{i}",
                                              target=self.doc_generators[i].generate,
                                              kwargs={"urls": urls_chunks[i]})
            processes.append(process)
            process.start()

        for process in processes:
            process.join()

        self._merge_all_folders()

        end_time = time.time()
        elapsed = end_time - start_time

        # Every image is counted as two files (the image and its annotation).
        file_count = 0
        for root, dirs, files in os.walk(self.out_dir):
            file_count += len(files)
        file_count /= 2

        print('Images:', int(file_count))
        print('Elapsed time:', elapsed)
        print('Urls per second:', self.max_urls / elapsed)
        print('Images per second:', file_count / elapsed)
        print()
        print('Seconds per url:', elapsed / self.max_urls)
        if file_count:
            print('Seconds per image:', elapsed / file_count)
        else:
            # No image was produced; avoid the division by zero.
            print('Seconds per image:', 'n/a (no images generated)')
        print('Images per url:', file_count / self.max_urls)

    def _split_urls_to_chunks(self, urls):
        """Split ``urls`` into ``num_processes`` contiguous, near-equal chunks.

        The first ``len(urls) % num_processes`` chunks get one extra element.
        """
        n = len(urls)
        chunk_size = n // self.num_processes
        remainder = n % self.num_processes

        chunks = []
        for i in range(self.num_processes):
            start_index = i * chunk_size + min(i, remainder)
            end_index = start_index + chunk_size + (1 if i < remainder else 0)
            chunks.append(urls[start_index:end_index])
        return chunks

    def _create_folders(self, remove_existing_dir):
        """Create one temporary output folder per process and return them.

        With ``remove_existing_dir`` the whole ``out_dir`` is deleted first.
        Leftover temporary folders from an earlier run are always removed so
        that ``os.makedirs`` starts from a clean state.
        """
        folders = [self.out_dir / f"tmp_process_{i}" for i in range(self.num_processes)]
        if remove_existing_dir and os.path.exists(self.out_dir):
            shutil.rmtree(self.out_dir)

        for folder in folders:
            if os.path.exists(folder):
                shutil.rmtree(folder)

        for folder in folders:
            os.makedirs(folder)

        return folders

    def _validate_annotations(self, image_path, anno_path):
        """Return True if both files exist and words/bboxes have equal length."""
        if not os.path.exists(image_path):
            print(f"Image {image_path} not found")
            return False
        if not os.path.exists(anno_path):
            print(f"Annotation {anno_path} not found")
            return False

        with open(anno_path, 'r') as f:
            anno = json.load(f)

        if len(anno['words']) != len(anno['bboxes']):
            print(f"Annotation {anno_path} has different number of words and bboxes")
            return False

        return True

    def _merge_all_folders(self):
        """Move validated results of all processes into ``out_dir``.

        Files are renamed to ``image_<counter>.png`` / ``.png.json`` (plus
        ``image_<counter>_colored.png`` when a colored image exists). Pairs
        failing validation stay in the temporary folders, which are deleted
        at the end.
        """
        counter = 0
        bad_annotations = 0
        for folder_path in tqdm(self.folders):
            if not os.path.isdir(folder_path):
                continue

            # Iterate over each annotation file in the current folder
            json_names = sorted(f for f in os.listdir(folder_path) if f.endswith('.png.json'))
            for file_name in json_names:
                json_path = os.path.join(folder_path, file_name)
                image_path = json_path[:-5]  # strip the trailing ".json"

                if not self._validate_annotations(image_path, json_path):
                    bad_annotations += 1
                    continue

                new_file_path = os.path.join(self.out_dir, f"image_{counter}.png")
                new_json_path = os.path.join(self.out_dir, f"image_{counter}.png.json")

                # Move and rename the image and its annotation
                shutil.move(image_path, new_file_path)
                shutil.move(json_path, new_json_path)

                # Move the colored image; "im_<number>.png.json" -> "<number>"
                _, number = file_name.split("_")
                number = number.split(".")[0]
                colored_path = f"{folder_path}/im_{number}_colored.png"
                if os.path.exists(colored_path):
                    shutil.move(colored_path, new_file_path[:-4] + "_colored.png")

                counter += 1

        for i in range(self.num_processes):
            shutil.rmtree(self.out_dir / f'tmp_process_{i}')

        print(f'Folder merge finished, bad annotations: {bad_annotations}')
164 |
--------------------------------------------------------------------------------
/src/document_generator.py:
--------------------------------------------------------------------------------
1 | import cProfile
2 | from concurrent.futures import ThreadPoolExecutor
3 | import json
4 | import multiprocessing
5 | import os
6 | from pathlib import Path
7 | import subprocess
8 | from time import sleep
9 | import time
10 | import traceback
11 | from bs4 import BeautifulSoup
12 | import numpy as np
13 | from augraphy import AugraphyPipeline
14 | from unoserver import client
15 | import requests
16 | from tqdm import tqdm
17 | from PIL import Image, ImageDraw
18 | import cv2
19 | import threading
20 |
21 | import src.utils as utils
22 | from src.augmentations import get_augmentation_phases
23 | from src.docx_document import DocxDocument
24 |
25 |
def profileit(func):
    """Decorator that profiles every call of ``func`` with cProfile.

    The statistics of each call are dumped to ``<function name>.profile`` in
    the current working directory, overwriting the file of a previous call.
    The wrapped function's return value is passed through unchanged.
    """
    import functools  # local import: keeps this block self-contained

    @functools.wraps(func)  # preserve __name__/__doc__ of the profiled function
    def wrapper(*args, **kwargs):
        datafn = func.__name__ + ".profile"  # Name the data file sensibly
        prof = cProfile.Profile()
        retval = prof.runcall(func, *args, **kwargs)
        prof.dump_stats(datafn)
        return retval

    return wrapper
35 |
class DocumentGenerator:
    """Turn web pages into augmented document images with word annotations.

    Each instance starts its own unoserver process and writes its results
    (``im_<n>.png``, ``im_<n>.png.json`` and, in debug mode,
    ``im_<n>_colored.png``) into ``out_folder``.
    """

    def __init__(self, max_threads, image_size, docx_config, out_folder, port, uno_port, debug_mode):
        """Store the settings and start the unoserver process for this generator."""
        self.max_threads = max_threads
        self.image_size = image_size
        self.out_folder = out_folder
        self.docx_config = docx_config
        self.port = port
        self.uno_port = uno_port
        self.debug_mode = debug_mode

        self.image_counter = 0
        # One lock shared by all worker threads of this generator. It guards
        # image_counter and the files whose names are derived from it.
        self._lock = threading.Lock()

        print('START SERVER', port, uno_port)
        # An argument list (no shell) makes the Popen handle refer to the
        # server process itself, so kill() reaches it directly; the output
        # is discarded as before.
        self.unoserver_process = subprocess.Popen(
            ["/usr/bin/python3", "-m", "unoserver.server",
             "--port", str(port), "--uno-port", str(uno_port)],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        self.uno_client = client.UnoClient(port=port)

    def __del__(self):
        # __init__ may have failed before the server was started.
        process = getattr(self, "unoserver_process", None)
        if process is not None:
            process.kill()

    def generate(self, urls):
        """Process all ``urls`` on a thread pool and wait for completion."""
        print('Start Document Generator...')
        with ThreadPoolExecutor(max_workers=self.max_threads,
                                thread_name_prefix=f"{multiprocessing.current_process().name}_thread") as executor:
            futures = [executor.submit(self.create_doc_try_except, url) for url in urls]
            for future in futures:
                future.result()

    def create_doc_try_except(self, url):
        """Run ``create_doc`` for one URL; failures are reported, never raised.

        In debug mode the full traceback is printed, otherwise a short
        message.
        """
        try:
            self.create_doc(url)
            print(f'{threading.current_thread().name} total images generated by the current process: {self.image_counter}')
        except Exception as e:
            if self.debug_mode:
                print(traceback.format_exc())
            else:
                print("skipping due augmentation error")

    #@profileit
    def create_doc(self, url):
        """Download one page, render it and save augmented images with annotations."""
        doc = DocxDocument(self.docx_config, self.uno_client)
        response = requests.get(url)
        if response.status_code != 200:
            print(f"Bad Response: {response}")
            return

        # create colored docx document
        soup = BeautifulSoup(response.text, 'html.parser')
        for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', "table"]):
            if element.name.startswith('h'):
                doc.add_heading(element)
            elif element.name == "table":
                doc.add_table(element)
            else:
                doc.add_text(element)

            # stop adding content once the configured word budget is exceeded
            if doc.get_num_words() > self.docx_config["max_words"]:
                break

        # extract annotations from colored images
        colored_images = doc.get_images(dpi=200, image_size=1500)
        annotations = self.get_bboxes(colored_images, doc.color2word)  # bboxes are normalized to [0,1]
        doc.convert_to_uncolored_docx()
        images = doc.get_images(dpi=200, image_size=1024)  # get images for augmentation stage

        # All scaling below uses the size of the first colored image.
        # NOTE(review): the augmented images are requested with a different
        # image_size than the colored ones - confirm that scaling the boxes
        # by the colored image size is intended for the augmentation stage.
        ref_width, ref_height = colored_images[0].size[0], colored_images[0].size[1]

        for i, image in enumerate(images):
            if len(annotations[i]['words']) != len(annotations[i]['bboxes']):
                continue
            # unnormalize bboxes to pixel coordinates
            bounding_boxes = np.array(annotations[i]["bboxes"])
            bounding_boxes = utils.unnormalize_bboxes(bounding_boxes, ref_width, ref_height)

            # perform augmentation
            augmentation_pipeline = AugraphyPipeline(bounding_boxes=bounding_boxes,
                                                     log=False, **get_augmentation_phases())

            augmented_cv2, _, _, augmented_bounding_boxes = augmentation_pipeline(np.array(image))
            augmented_image = Image.fromarray(augmented_cv2)

            # The shared lock serializes counter use across worker threads;
            # a lock created inline here would not exclude anyone.
            with self._lock:
                if self.debug_mode:
                    bboxes_for_image = utils.normalize_bboxes(augmented_bounding_boxes, ref_width, ref_height)
                    bboxes_for_image = utils.unnormalize_bboxes(bboxes_for_image, augmented_image.size[0], augmented_image.size[1])

                    augmented_image = utils.draw_bboxes_pil(augmented_image, bboxes_for_image, annotations[i]["words"])
                    colored_image = utils.draw_bboxes_pil(colored_images[i], bounding_boxes, annotations[i]["words"])
                    colored_image.save(self.out_folder / f"im_{self.image_counter}_colored.png")

                # resize image to final dataset size and save
                augmented_image = augmented_image.resize((self.image_size, self.image_size))
                augmented_image.save(self.out_folder / f"im_{self.image_counter}.png")

                # round the boxes to integer pixels and normalize to [0,1]
                # NOTE(review): get_bboxes stores (x1, y1, x2, y2) and no
                # conversion to (x, y, w, h) happens before saving - confirm
                # which format consumers of the JSON files expect.
                augmented_bounding_boxes = np.array(augmented_bounding_boxes).astype(int)
                annotations[i]["bboxes"] = utils.normalize_bboxes(augmented_bounding_boxes, ref_width, ref_height).tolist()

                # save annotation
                with open(self.out_folder / f"im_{self.image_counter}.png.json", "w") as f:
                    json.dump(annotations[i], f)
                self.image_counter += 1

    def get_bboxes(self, images, color2word):
        """Locate the colored word rectangles on every page image.

        Args:
            images: PIL images of the colored document pages.
            color2word: Mapping from ``#rrggbb`` color codes to words.

        Returns:
            One dict per image with the keys ``words`` and ``bboxes``; the
            boxes are ``(x1, y1, x2, y2)`` tuples normalized by the image
            width and height.
        """
        annotations = []
        for image_pil in images:
            width, height = image_pil.size
            image_annotations = {"words": [], "bboxes": []}
            image = np.asarray(image_pil)

            # everything that is not (almost) pure white becomes foreground
            thr = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
            thr = cv2.threshold(thr, 254, 255, cv2.THRESH_BINARY_INV)[1]
            cnts = cv2.findContours(thr, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]

            for c in cnts:
                peri = cv2.arcLength(c, True)
                approx = cv2.approxPolyDP(c, 0.015 * peri, True)

                # keep only contours approximated by four points
                if len(approx) != 4:
                    continue

                x, y, w, h = cv2.boundingRect(approx)
                # sample one pixel inside the rectangle to read its fill color
                rgb_color = image_pil.getpixel((x + 1, y + 1))
                color = '#%02x%02x%02x' % (rgb_color)
                if color in color2word:
                    word = color2word[color]
                    image_annotations['words'].append(word)
                    image_annotations["bboxes"].append(
                        (
                            x / width,
                            y / height,
                            (x + w) / width,
                            (y + h) / height)
                    )
            annotations.append(image_annotations)
        return annotations
166 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DoGe — Synthetic DOcument GEnerator for Document AI
2 |
3 | DoGe is designed to synthesize a dataset of realistic document scans. Each document contains meaningful text, headings,
4 | tables, paragraphs with different formatting and fonts which are parsed from Wikipedia. The coordinates
5 | of the words are extracted using the No-OCR method we invented for faster generation on CPU.
6 |
7 | ## Document examples
8 |
9 |
15 |
16 | Check the full size (1024x1024) in [resources](./resources) folder.
17 |
18 | ## Usage
19 |
20 | ### Docker installation
21 |
22 | You can use Docker image with predefined environment to run DoGe:
23 | ```bash
24 | git clone https://github.com/Travvy88/DocumentGenerator_DoGe
25 | cd DocumentGenerator_DoGe
26 | docker build -t doge .
27 | ```
28 |
29 | After the image is built, you can [start document generation](#start-data-generation) inside the Docker container.
30 | In the `docker run` command shown there, replace the host output folder path with a folder on your machine.
31 |
32 | ### Ubuntu installation
33 |
34 | For faster generation, it is recommended to install all dependencies without Docker.
35 | DoGe is tested on Ubuntu 22.04.
36 | ```bash
37 | sudo apt-get update
38 | sudo apt-get install libreoffice libjpeg-dev zlib1g-dev poppler-utils
39 | /usr/bin/python3 -m pip install --user unoserver # install unoserver on system python
40 |
41 | git clone https://github.com/Travvy88/DocumentGenerator_DoGe
42 | cd DocumentGenerator_DoGe
43 | # there you can make venv if needed!
44 | pip3 install -r requirements.txt
45 | ```
46 |
47 | ## Start Data Generation
48 |
49 | Docker:
50 |
51 | ```bash
52 | docker run -v /full/path/to/output/folder/on/your/computer:/app/data doge python3 main.py --out_dir data --image_size 244 --max_urls 4 --num_processes 2 --ports 4000 4001 4002 4003
53 | ```
54 |
55 | Ubuntu:
56 |
57 | ```bash
58 | python3 main.py --out_dir data --image_size 244 --max_urls 4 --num_processes 2 --ports 4000 4001 4002 4003
59 | ```
60 |
61 | ### Main.py
62 |
63 | The following arguments can be passed to the script:
64 |
65 | - `--out_dir`: The output directory for saving results. This argument is required.
66 | - `--remove_existing_dir`: If set, the output directory will be deleted before creating a new one.
67 | - `--image_size`: The size of the final images. Default is `244`.
68 | - `--start_page`: The starting page URL. Default is the Wikipedia main English page. You can use another language Wiki main page URL.
69 | - `--languages`: Permitted languages. Pages with other localizations will be ignored. Default is `['en']`.
70 | - `--max_urls`: The maximum number of URLs to process. Default is `16`.
71 | - `--num_processes`: The number of processes to use. Default is `1`. Each process will start DocumentGenerator and start Unoserver for each generator.
72 | - `--max_threads`: The maximum threads inside a process. Default is `3`.
73 | - `--ports`: The list of ports to use. Default is `[8145, 8146]`. The number of ports should be 2 times larger than `num_processes` (each Unoserver instance needs 2 ports for proper multicore work)
74 | - `--debug`: If set, draws bounding boxes + words on each image and saves intermediate images with highlighted words.
75 |
76 |
77 | ### Docx_config.json
78 |
79 | | Parameter | Description |
80 | |-------------|-------------|
81 | | `max_words` | The maximum number of words allowed in the generated documents. |
82 | | `p_2columns` | The probability that the document will be formatted into two columns. |
83 | | `font_size_interval` | The font size range from which the size is randomly selected for each document. |
84 | | `p_line_spacing` | A list of probabilities controlling the line spacing of the document (1.5 or double). |
85 | | `p_text_alignment` | A list of probabilities controlling the text alignment of the document (center, left, right, justify). |
86 | | `p_heading_bold` | The probability that headings will be displayed in bold font. |
87 | | `heading_relative_size_interval` | The range of relative font sizes for headings. The relative font size is chosen randomly. |
88 | | `p_heading_alignment` | A list of probabilities controlling the alignment of headings (center, left, right, justify). |
89 | | `table_max_rows` | The maximum number of rows allowed in a table. Tables with more than the specified number of rows are dropped. |
90 | | `table_max_cols` | The maximum number of columns allowed in a table. Tables with more than the specified number of columns are dropped. |
91 |
92 | For parameters given as probabilities or intervals, the concrete values are chosen randomly for each document.
93 |
94 | In my experience, the generator produces about 14 images per URL on average
95 | with the above Docx settings.
96 |
97 | ### Augmentations
98 |
99 | The augmentation pipeline is applied at the final stage. You can manage different augmentations
100 | in `src/augmentations.py` file. Read the [Augraphy Docs](https://augraphy.readthedocs.io/en/latest/) for detailed explanation.
101 |
102 |
103 | ## How it works
104 | 
105 |
106 | Firstly, the `Manager` class creates the `DocumentGenerator` instances in separate processes. For
107 | each `DocumentGenerator`, a Unoserver instance is started.
108 |
109 | Then, the `UrlParser` generates a list of URLs by crawling the web, starting from a given start page
110 | and following links on each page. It uses `BeautifulSoup` to parse HTML content and extract links,
111 | then checks each link's validity and language, adding it to the list if it meets certain conditions.
112 | The process continues until a maximum number of URLs is reached, and the method returns the list of
113 | generated URLs, excluding the starting URL.
114 |
115 | When data generation begins, the list of URLs is divided into several chunks for each `DocumentGenerator`.
116 | Each `DocumentGenerator` instance retrieves a Wikipedia HTML page by URL from its chunk.
117 | Headers, paragraphs formatting, and tables are extracted and placed into a Docx document via the `DocxDocument` class.
118 | At this stage, some random parametrization is applied according to `docx_config.json`.
119 | For example, font size, text alignment, one or two columns, and other parameters
120 | are chosen for each document randomly.
121 |
122 | After that, each word in the Docx is filled with a unique color. As a result, a colored rectangle
123 | appears in place of each word. The image will be encoded with 24-bit color depth,
124 | so the maximum number of words per document is 16,777,216. The text of each word is saved to a hashmap of type color_code -> word.
125 |
126 | The next step is Docx to image conversion. DoGe uses Unoserver to convert Docx to Pdf and
127 | pdf2image for image rendering.
128 |
129 | Then, all rectangle coordinates are detected via OpenCV on converted images. The word for each bounding box is retrieved from the hashmap.
130 | DoGe saves annotations to JSON files in the following format:
131 |
132 | ```json
133 | {
134 | "words": [
135 | "Hello",
136 | "World"
137 | ],
138 | "bboxes": [
139 | [0.1, 0.1, 0.03, 0.02],
140 | [0.4, 0.3, 0.11, 0.02]
141 | ]
142 | }
143 | ```
144 |
145 | The bboxes are normalized and saved in XYWH format.
146 |
147 | The final step is deleting all color fills from words in the Docx document, rendering images, applying Augraphy augmentations,
148 | and saving the augmented images to disk. That's it!
149 |
150 | ## Join us!
151 | DoGe is a promising method for producing synthetic document datasets. Here are some features that would help many developers:
152 | - Download and place images into documents
153 | - Add annotations of headers, tables, paragraphs and images (if added)
154 | - Add different output formats (Parquet for example)
155 | - Add additional information via LLMs
156 | - Performance improvement: the **bottleneck** of generation is the Docx -> Pdf -> Png conversion! I am looking for a simpler way to convert Docx to Png.
157 |
158 | If you have any ideas or want to take part in the development of DoGe, write to me:
159 | - travvy88@yandex.ru
160 | - https://t.me/travvy88
161 |
162 | Or create a Pull Request to this repo. I will be glad to improve the project with the power of the community.
163 |
164 | ## Acknowledgments
165 | Here are some great open-source projects I benefit from:
166 | - [ISP RAS Dedoc Team](https://github.com/ispras/dedoc) for support and assistance.
167 | - [Augraphy](https://github.com/sparkfish/augraphy) for the augmentation code applied to the final images.
168 | - [Unoserver](https://github.com/unoconv/unoserver) for the Docx to Pdf converter.
169 | - [Pdf2image](https://github.com/Belval/pdf2image) for the module that renders images from Pdf.
170 | - [Pillow-SIMD](https://github.com/uploadcare/pillow-simd) for faster image processing.
171 |
--------------------------------------------------------------------------------
/src/docx_document.py:
--------------------------------------------------------------------------------
1 | import cProfile
2 | import io
3 | import re
4 | from docx import Document
5 | from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING
6 | from docx.oxml.ns import qn
7 | from docx.shared import Pt, RGBColor
8 | from docx.oxml import OxmlElement
9 | import matplotlib.font_manager
10 | import numba
11 | import numpy as np
12 | from pdf2image import convert_from_bytes
13 | from PIL import Image
def profileit(func):
    """Decorate *func* so that every call is profiled with cProfile.

    The collected statistics are written to ``<func.__name__>.profile`` in the
    current working directory, overwriting the file from any previous call.

    Args:
        func: The callable to profile.

    Returns:
        A wrapper with the same signature and metadata as *func* that returns
        whatever *func* returns and re-raises whatever *func* raises.
    """
    # Imported locally so the decorator does not depend on the module's
    # top-level import block.
    import functools

    @functools.wraps(func)  # keep __name__/__doc__ of the profiled function
    def wrapper(*args, **kwargs):
        datafn = func.__name__ + ".profile"  # Name the data file sensibly
        prof = cProfile.Profile()
        try:
            return prof.runcall(func, *args, **kwargs)
        finally:
            # Dump even when the call raised: a failing run is often the one
            # whose profile is most interesting.
            prof.dump_stats(datafn)

    return wrapper
23 |
class DocxDocument:
    """Build a randomly styled Docx document whose words are colour-coded.

    Every word added through :meth:`add_words` is written as its own run and
    shaded with the next colour from ``self.colors``; the mapping
    ``colour -> word`` is accumulated in ``self.color2word``.  The document can
    be rendered to page images with :meth:`get_images` and stripped of the
    colour coding with :meth:`convert_to_uncolored_docx`.

    Keys read from ``docx_config``: ``max_words``, ``p_2columns``,
    ``font_size_interval``, ``p_line_spacing``, ``p_text_alignment``,
    ``p_heading_bold``, ``heading_relative_size_interval``,
    ``p_heading_alignment``, ``table_max_rows`` and ``table_max_cols``.
    """

    # Font family names found on this machine, filled on first use.  Scanning
    # the system fonts is slow, so it is done once per process instead of once
    # per document.
    _font_names_cache = None

    def __init__(self, docx_config, uno_client):
        """Create an empty document and sample its random layout settings.

        Args:
            docx_config: Mapping with the keys listed in the class docstring.
            uno_client: Object exposing ``convert(indata=..., convert_to=...)``;
                used by :meth:`get_images` to turn Docx bytes into Pdf bytes.
        """
        self.docx_config = docx_config
        self.uno_client = uno_client

        self.doc = Document()
        self.colors = self._init_colors(docx_config["max_words"])

        self.color2word = {}
        self.color_ptr = 0

        # sample random settings from docx_config
        if np.random.binomial(1, self.docx_config["p_2columns"]):
            self.num_columns = 2
        else:
            self.num_columns = 1
        self.configure_several_columns()

        self.font_size = Pt(np.random.randint(*self.docx_config["font_size_interval"]))
        self.font_name = str(np.random.choice(self._list_available_fonts()))

        self.line_spacing = self._weighted_choice(
            (WD_LINE_SPACING.ONE_POINT_FIVE, WD_LINE_SPACING.DOUBLE),
            self.docx_config["p_line_spacing"])

        alignments = (WD_PARAGRAPH_ALIGNMENT.CENTER, WD_PARAGRAPH_ALIGNMENT.LEFT,
                      WD_PARAGRAPH_ALIGNMENT.RIGHT, WD_PARAGRAPH_ALIGNMENT.JUSTIFY)
        self.paragraph_alignment = self._weighted_choice(
            alignments, self.docx_config["p_text_alignment"])

        self.heading_bold = bool(np.random.binomial(1, self.docx_config["p_heading_bold"]))
        self.heading_relative_size = np.random.uniform(*self.docx_config["heading_relative_size_interval"])
        # font_size is a Length (EMU); scale its value in points, otherwise the
        # point -> EMU conversion performed by Pt() would be applied twice.
        self.heading_size = Pt(self.heading_relative_size * self.font_size.pt)
        self.heading_alignment = self._weighted_choice(
            alignments, self.docx_config["p_heading_alignment"])

    def _normalize_probabilities(self, p):
        """Return *p* as a NumPy array scaled so that its entries sum to 1."""
        return np.array(p) / sum(p)

    def _weighted_choice(self, options, weights):
        """Pick one element of *options* with probability proportional to *weights*.

        An index is drawn instead of the element itself so that the returned
        value is the original object (e.g. the enum member) and not a NumPy
        scalar produced by converting *options* to an array.
        """
        index = np.random.choice(len(options), p=self._normalize_probabilities(weights))
        return options[int(index)]

    def _list_available_fonts(self):
        """Return the sorted family names of the TrueType fonts found on the system.

        The result is cached on the class after the first call.  Fonts that
        matplotlib fails to load are reported on stdout and skipped.
        """
        cls = type(self)
        if cls._font_names_cache is None:
            font_paths = matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf')
            font_names = set()

            for font_path in font_paths:
                try:
                    font = matplotlib.font_manager.get_font(font_path)
                    font_names.add(font.family_name)
                except RuntimeError as e:
                    print(f"Could not load font from path: {font_path}, error: {e}")
            # Sorted so the list order does not depend on string hashing.
            cls._font_names_cache = sorted(font_names)
        return list(cls._font_names_cache)

    def _init_colors(self, max_colors):
        """Generate the list of ``#rrggbb`` colours used to tag words.

        The list holds ``(int(max_colors ** (1/3)) + 1) ** 3`` entries.  Starting
        from black, each step adds the current loop indices ``(i, j, k)`` to the
        running ``(r, g, b)`` value, modulo 256.

        NOTE(review): this walk does not obviously guarantee that all generated
        colours are distinct once channels wrap around - worth confirming.
        """
        side = int(max_colors ** (1/3)) + 1
        colors = []
        r = g = b = 0
        for i in range(side):
            for j in range(side):
                for k in range(side):
                    r = (r + i) % 256
                    g = (g + j) % 256
                    b = (b + k) % 256
                    colors.append('#{:02x}{:02x}{:02x}'.format(r, g, b))
        return colors

    def configure_several_columns(self):
        """Write ``self.num_columns`` into the ``w:cols`` element of the first section."""
        section = self.doc.sections[0]
        sectPr = section._sectPr
        cols = sectPr.xpath('./w:cols')[0]
        cols.set(qn('w:num'), str(self.num_columns))

    def add_paragraph(self):
        """Append an empty body paragraph styled with the sampled settings and return it."""
        paragraph = self.doc.add_paragraph()
        paragraph.alignment = self.paragraph_alignment
        paragraph.paragraph_format.space_after = 0
        paragraph.paragraph_format.line_spacing_rule = self.line_spacing

        return paragraph

    def add_heading(self, element):
        """Append a heading built from an HTML heading element.

        Args:
            element: Object with ``text`` and a ``name`` such as ``"h2"``; the
                digit in ``name`` is used as the heading level.

        Nothing is added when the previous block was already a heading
        (a heading is always followed by one spacer paragraph, hence the look
        at ``paragraphs[-2]``) or when the text is exactly ``"Contents"``.
        """
        text = element.text
        level = int(element.name[1])

        paragraphs = self.doc.paragraphs  # rebuilt on every access, so fetch once
        if len(paragraphs) > 1 and "Heading" in paragraphs[-2].style.style_id:
            return

        if text == "Contents":
            return

        paragraph = self.doc.add_heading(level=level)
        _, metadata = self.add_words(text, paragraph)
        self.color2word.update(metadata)
        paragraph.alignment = self.heading_alignment

        # Single-spaced empty paragraph separating the heading from the body.
        spacer = self.add_paragraph()
        spacer.paragraph_format.line_spacing_rule = WD_LINE_SPACING.SINGLE

    def add_table(self, html_element):
        """Append a table built from an HTML ``<table>`` element.

        The table is skipped when it has no rows or cells, or when it exceeds
        ``table_max_rows`` / ``table_max_cols`` from the config.  Borders are
        initially white; :meth:`convert_to_uncolored_docx` turns them black.
        """
        parsed_table = [
            [cell.text.strip() for cell in row.find_all(['th', 'td'])]
            for row in html_element.find_all('tr')
        ]
        if not parsed_table:
            return  # max() below would raise on an empty table

        rows = len(parsed_table)
        cols = max(len(row) for row in parsed_table)
        if cols == 0:
            return
        if rows > self.docx_config["table_max_rows"] or cols > self.docx_config["table_max_cols"]:
            return

        table = self.doc.add_table(rows=rows, cols=cols)
        table.style = 'TableGrid'
        self.set_table_border_color(table, "FFFFFF")
        # Populating table data
        for i, row_data in enumerate(parsed_table):
            for j, cell_data in enumerate(row_data):
                _, metadata = self.add_words(cell_data, table.cell(i, j).paragraphs[0])
                self.color2word.update(metadata)

    def add_text(self, html_element):
        """Append a body paragraph containing the text of *html_element*'s children.

        Each child's ``name`` (``'b'``, ``'i'``, ``'u'`` or anything else) is
        passed to :meth:`add_words` as the formatting of its words.
        """
        paragraph = self.add_paragraph()
        prev_word = " "
        for i, child in enumerate(html_element.children):
            prev_word, metadata = self.add_words(
                child.get_text(), paragraph, formatting=child.name,
                prev_word=prev_word, first_word=(i == 0))
            self.color2word.update(metadata)

    def add_words(self, text, paragraph, formatting=None, prev_word=" ", first_word=False):
        """Add the words of *text* to *paragraph*, one colour-coded run per word.

        Args:
            text: Raw text; bracketed fragments such as ``[1]`` are removed
                before the text is split on whitespace.
            paragraph: Paragraph that receives the runs.
            formatting: ``'b'``, ``'i'`` or ``'u'`` for bold, italic or
                underlined words; any other value leaves the runs plain.
            prev_word: Last word written before this call; it decides whether
                a separator is needed before the first word.
            first_word: When true, the first word written is preceded by four
                spaces (a paragraph indent) instead of a single space.

        Returns:
            ``(prev_word, metadata)`` where ``prev_word`` is the last word
            written (or the argument when nothing was written) and
            ``metadata`` maps each colour used to its word.
        """
        text = re.sub(r'\[.*?\]', '', text)
        metadata = {}
        for word in re.split(r'\s+', text):
            if not word:
                continue

            # No separator before closing punctuation or after an opening bracket.
            if word[0] not in ",.?!:;)}]»" and prev_word[-1] not in "«[{(":
                paragraph.add_run(' ' * 4 if first_word else ' ')
            # The indent applies to the first word only, not to every word of
            # the first chunk.
            first_word = False

            run = paragraph.add_run(word)
            color = self.color_word(run)
            metadata[color] = word
            if formatting == 'b':
                run.bold = True
            if formatting == 'i':
                run.italic = True
            if formatting == 'u':
                run.underline = True
            prev_word = word
            run.font.size = self.font_size
            run.font.name = self.font_name
        return prev_word, metadata

    def color_word(self, run):
        """Shade *run* and colour its text with the next unused colour.

        Text and background get the same colour, so the word renders as a
        solid rectangle.

        Returns:
            The ``#rrggbb`` colour that was applied.

        Raises:
            IndexError: If all generated colours have already been used.
        """
        if self.color_ptr >= len(self.colors):
            raise IndexError(
                f"all {len(self.colors)} word colours are already in use")
        color = self.colors[self.color_ptr]
        self.color_ptr += 1

        # Create the shading element
        shd = OxmlElement('w:shd')
        shd.set(qn('w:val'), 'clear')
        shd.set(qn('w:color'), 'auto')
        shd.set(qn('w:fill'), color)

        # Make sure the run has an rPr element to attach the shading to.
        run.element.get_or_add_rPr().append(shd)

        run.font.color.rgb = RGBColor(*tuple(int(color[i:i + 2], 16) for i in (1, 3, 5)))
        return color

    def set_table_border_color(self, table, color):
        """Give *table* single-line borders of the given hex *color* on all sides.

        Any ``w:tblBorders`` element left by a previous call is removed first,
        so calling this method again replaces the colour instead of leaving two
        conflicting border definitions in the table properties.
        """
        tbl_pr = table._element.tblPr

        for stale in tbl_pr.findall(qn('w:tblBorders')):
            tbl_pr.remove(stale)

        tbl_borders = OxmlElement('w:tblBorders')
        for border in ('top', 'left', 'bottom', 'right', 'insideH', 'insideV'):
            border_element = OxmlElement(f'w:{border}')
            border_element.set(qn('w:val'), 'single')
            border_element.set(qn('w:sz'), '4')
            border_element.set(qn('w:space'), '0')
            border_element.set(qn('w:color'), color)
            tbl_borders.append(border_element)

        tbl_pr.append(tbl_borders)

    def save_docx(self, path):
        """Save the document to *path*."""
        self.doc.save(path)

    #@profileit
    def get_images(self, image_size, dpi) -> "list[Image.Image]":
        """Render the document to page images.

        The document is serialised in memory, converted to Pdf through
        ``self.uno_client`` and rasterised with pdf2image.

        Args:
            image_size: Forwarded to ``convert_from_bytes`` as ``size``.
            dpi: Forwarded to ``convert_from_bytes`` as ``dpi``.
        """
        out = io.BytesIO()
        self.doc.save(out)
        doc_bytes = out.getvalue()
        pdf_bytes = self.uno_client.convert(indata=doc_bytes, convert_to='pdf')
        return convert_from_bytes(pdf_bytes, dpi=dpi, size=image_size)

    def _uncolor_run(self, run):
        """Make *run* black text on a white fill."""
        run.font.color.rgb = RGBColor(0, 0, 0)
        rpr = run.element.get_or_add_rPr()
        element = rpr.find(qn('w:shd'))
        if element is not None:
            element.set(qn('w:fill'), "#FFFFFF")

    def convert_to_uncolored_docx(self):
        """Remove the colour coding: black text, white fill, black table borders.

        Body paragraphs and table cells are visited separately because
        ``self.doc.paragraphs`` is iterated on its own and tables are reached
        through ``self.doc.tables``.  Only the first paragraph of each cell is
        touched, which is where :meth:`add_table` writes its words.
        """
        for paragraph in self.doc.paragraphs:
            for run in paragraph.runs:
                self._uncolor_run(run)

        for table in self.doc.tables:
            self.set_table_border_color(table, "000000")
            for row in table.rows:
                for cell in row.cells:
                    for run in cell.paragraphs[0].runs:
                        self._uncolor_run(run)

    def get_num_words(self):
        """Return the number of distinct colours (words) recorded so far."""
        return len(self.color2word)
273 |
--------------------------------------------------------------------------------
/src/augmentations.py:
--------------------------------------------------------------------------------
1 | import random
2 | from augraphy import *
3 |
4 |
def get_augmentation_phases():
    """Build the lists of Augraphy augmentations, grouped by phase.

    Returns:
        A dict with the keys ``'ink_phase'``, ``'paper_phase'``,
        ``'post_phase'`` and ``'pre_phase'``, each mapping to a list of
        augmentation objects (``pre_phase`` is currently empty).

    Arguments written as ``random.choice(...)``, ``random.randint(...)`` or
    ``random.uniform(...)`` are evaluated when this function runs, so those
    values are fixed for the objects returned by one call and re-drawn only on
    the next call.
    """

    pre_phase = [
        # Rescale(scale="optimal", target_dpi = 300, p = 1.0),
    ]

    # Augmentations listed for the ink phase; every top-level entry has p=0.1.
    ink_phase = [
        InkColorSwap(
            ink_swap_color="random",
            ink_swap_sequence_number_range=(5, 10),
            ink_swap_min_width_range=(2, 3),
            ink_swap_max_width_range=(100, 120),
            ink_swap_min_height_range=(2, 3),
            ink_swap_max_height_range=(100, 120),
            ink_swap_min_area_range=(10, 20),
            ink_swap_max_area_range=(400, 500),
            p=0.1,
        ),
        LinesDegradation(
            line_roi=(0.0, 0.0, 1.0, 1.0),
            line_gradient_range=(32, 255),
            line_gradient_direction=(0, 2),
            line_split_probability=(0.2, 0.4),
            line_replacement_value=(250, 255),
            line_min_length=(30, 40),
            line_long_to_short_ratio=(5, 7),
            line_replacement_probability=(0.4, 0.5),
            line_replacement_thickness=(1, 3),
            p=0.1,
        ),
        OneOf(
            [
                Dithering(
                    dither=random.choice(["ordered", "floyd-steinberg"]),
                    order=(3, 5),
                ),
                InkBleed(
                    intensity_range=(0.1, 0.2),
                    kernel_size=random.choice([(7, 7), (5, 5), (3, 3)]),
                    severity=(0.4, 0.6),
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                InkShifter(
                    text_shift_scale_range=(18, 27),
                    text_shift_factor_range=(1, 4),
                    text_fade_range=(0, 2),
                    blur_kernel_size=(5, 5),
                    blur_sigma=0,
                    noise_type="random",
                ),
                BleedThrough(
                    intensity_range=(0.1, 0.3),
                    color_range=(32, 224),
                    ksize=(17, 17),
                    sigmaX=1,
                    alpha=random.uniform(0.1, 0.2),
                    offsets=(10, 20),
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                Hollow(
                    hollow_median_kernel_value_range=(71, 101),
                    hollow_min_width_range=(1, 2),
                    hollow_max_width_range=(150, 200),
                    hollow_min_height_range=(1, 2),
                    hollow_max_height_range=(150, 200),
                    hollow_min_area_range=(10, 20),
                    hollow_max_area_range=(2000, 5000),
                    hollow_dilation_kernel_size_range=(1, 2),
                ),
                Letterpress(
                    n_samples=(100, 400),
                    n_clusters=(200, 400),
                    std_range=(500, 3000),
                    value_range=(150, 224),
                    value_threshold_range=(96, 128),
                    blur=1,
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                LowInkRandomLines(
                    count_range=(5, 10),
                    use_consistent_lines=random.choice([True, False]),
                    noise_probability=0.1,
                ),
                LowInkPeriodicLines(
                    count_range=(2, 5),
                    period_range=(16, 32),
                    use_consistent_lines=random.choice([True, False]),
                    noise_probability=0.1,
                ),
            ],
            p=0.1,
        ),
    ]

    # Augmentations listed for the paper phase; every top-level entry has p=0.1.
    paper_phase = [
        PaperFactory(p=0.1),
        ColorPaper(
            hue_range=(0, 255),
            saturation_range=(10, 40),
            p=0.1,
        ),
        OneOf(
            [
                #DelaunayTessellation(
                #    n_points_range=(500, 800),
                #    n_horizontal_points_range=(500, 800),
                #    n_vertical_points_range=(500, 800),
                #    noise_type="random",
                #    color_list="default",
                #    color_list_alternate="default",
                #),
                PatternGenerator(
                    imgx=random.randint(256, 512),
                    imgy=random.randint(256, 512),
                    n_rotation_range=(10, 15),
                    color="random",
                    alpha_range=(0.25, 0.5),
                ),
                #VoronoiTessellation(
                #    mult_range=(50, 80),
                #    seed=19829813472,
                #    num_cells_range=(500, 1000),
                #    noise_type="random",
                #    background_value=(200, 255),
                #),
            ],
            p=0.1,
        ),
        WaterMark(
            watermark_word="random",
            watermark_font_size=(10, 15),
            watermark_font_thickness=(20, 25),
            watermark_rotation=(0, 360),
            watermark_location="random",
            watermark_color="random",
            watermark_method="darken",
            p=0.1,
        ),
        # The two sequences below hold the same two texturizers in opposite order.
        OneOf(
            [
                AugmentationSequence(
                    [
                        NoiseTexturize(
                            sigma_range=(3, 10),
                            turbulence_range=(2, 5),
                            texture_width_range=(300, 500),
                            texture_height_range=(300, 500),
                        ),
                        BrightnessTexturize(
                            texturize_range=(0.9, 0.99),
                            deviation=0.03,
                        ),
                    ],
                ),
                AugmentationSequence(
                    [
                        BrightnessTexturize(
                            texturize_range=(0.9, 0.99),
                            deviation=0.03,
                        ),
                        NoiseTexturize(
                            sigma_range=(3, 10),
                            turbulence_range=(2, 5),
                            texture_width_range=(300, 500),
                            texture_height_range=(300, 500),
                        ),
                    ],
                ),
            ],
            p=0.1,
        ),
    ]

    # Augmentations listed for the post phase; every top-level entry has p=0.1.
    post_phase = [
        ColorShift(
            color_shift_offset_x_range=(3, 5),
            color_shift_offset_y_range=(3, 5),
            color_shift_iterations=(2, 3),
            color_shift_brightness_range=(0.9, 1.1),
            color_shift_gaussian_kernel_range=(3, 3),
            p=0.1,
        ),
        OneOf(
            [
                DirtyDrum(
                    line_width_range=(1, 6),
                    line_concentration=random.uniform(0.05, 0.15),
                    direction=random.randint(0, 2),
                    noise_intensity=random.uniform(0.6, 0.95),
                    noise_value=(64, 224),
                    ksize=random.choice([(3, 3), (5, 5), (7, 7)]),
                    sigmaX=0,
                    # NOTE(review): this p is set in addition to the p of the
                    # enclosing OneOf - confirm that the combination is intended.
                    p=0.1,
                ),
                DirtyRollers(
                    line_width_range=(2, 32),
                    scanline_type=0,
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                LightingGradient(
                    light_position=None,
                    direction=None,
                    max_brightness=255,
                    min_brightness=0,
                    mode="gaussian",
                    linear_decay_rate=None,
                    transparency=None,
                ),
                Brightness(
                    brightness_range=(0.9, 1.1),
                    min_brightness=0,
                    min_brightness_value=(120, 150),
                ),
                Gamma(
                    gamma_range=(0.9, 1.1),
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                SubtleNoise(
                    subtle_range=random.randint(5, 10),
                ),
                Jpeg(
                    quality_range=(25, 95),
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                Markup(
                    num_lines_range=(2, 7),
                    markup_length_range=(0.5, 1),
                    markup_thickness_range=(1, 2),
                    markup_type=random.choice(["strikethrough", "crossed", "highlight", "underline"]),
                    markup_color="random",
                    single_word_mode=False,
                    repetitions=1,
                ),
                Scribbles(
                    scribbles_type="random",
                    scribbles_location="random",
                    scribbles_size_range=(250, 600),
                    scribbles_count_range=(1, 6),
                    scribbles_thickness_range=(1, 3),
                    scribbles_brightness_change=[32, 64, 128],
                    scribbles_text="random",
                    scribbles_text_font="random",
                    scribbles_text_rotate_range=(0, 360),
                    scribbles_lines_stroke_count_range=(1, 6),
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                #BadPhotoCopy(
                #    noise_mask=None,
                #    noise_type=-1,
                #    noise_side="random",
                #    noise_iteration=(1, 2),
                #    noise_size=(1, 3),
                #    noise_value=(128, 196),
                #    noise_sparsity=(0.3, 0.6),
                #    noise_concentration=(0.1, 0.6),
                #    blur_noise=random.choice([True, False]),
                #    blur_noise_kernel=random.choice([(3, 3), (5, 5), (7, 7)]),
                #    wave_pattern=random.choice([True, False]),
                #    edge_effect=random.choice([True, False]),
                #),
                ShadowCast(
                    shadow_side="random",
                    shadow_vertices_range=(1, 20),
                    shadow_width_range=(0.3, 0.8),
                    shadow_height_range=(0.3, 0.8),
                    shadow_color=(0, 0, 0),
                    shadow_opacity_range=(0.2, 0.9),
                    shadow_iterations_range=(1, 2),
                    shadow_blur_kernel_range=(101, 301),
                ),
                LowLightNoise(
                    num_photons_range=(50, 100),
                    alpha_range=(0.7, 1.0),
                    beta_range=(10, 30),
                    gamma_range=(1, 1.8),
                    bias_range=(20, 40),
                    dark_current_value=1.0,
                    exposure_time=0.2,
                    gain=0.1,
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                NoisyLines(
                    noisy_lines_direction="random",
                    noisy_lines_location="random",
                    noisy_lines_number_range=(5, 20),
                    noisy_lines_color=(0, 0, 0),
                    noisy_lines_thickness_range=(1, 2),
                    noisy_lines_random_noise_intensity_range=(0.01, 0.1),
                    noisy_lines_length_interval_range=(0, 100),
                    noisy_lines_gaussian_kernel_value_range=(3, 5),
                    noisy_lines_overlay_method="ink_to_paper",
                ),
                BindingsAndFasteners(
                    overlay_types="darken",
                    foreground=None,
                    effect_type="random",
                    width_range="random",
                    height_range="random",
                    angle_range=(-30, 30),
                    ntimes=(2, 6),
                    nscales=(0.9, 1.0),
                    edge="random",
                    edge_offset=(10, 50),
                    use_figshare_library=0,
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                Squish(
                    squish_direction="random",
                    squish_location="random",
                    squish_number_range=(5, 10),
                    squish_distance_range=(5, 7),
                    squish_line="random",
                    squish_line_thickness_range=(1, 1),
                ),
                Geometric(
                    fliplr=False,
                    flipud=False,
                    crop=(),
                    rotate_range=(-4, 4),
                    randomize=0,
                    p=1,
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                DotMatrix(
                    dot_matrix_shape="random",
                    dot_matrix_dot_width_range=(3, 3),
                    dot_matrix_dot_height_range=(3, 3),
                    dot_matrix_min_width_range=(1, 2),
                    dot_matrix_max_width_range=(150, 200),
                    dot_matrix_min_height_range=(1, 2),
                    dot_matrix_max_height_range=(150, 200),
                    dot_matrix_min_area_range=(10, 20),
                    dot_matrix_max_area_range=(2000, 5000),
                    dot_matrix_median_kernel_value_range=(128, 255),
                    dot_matrix_gaussian_kernel_value_range=(1, 3),
                    dot_matrix_rotate_value_range=(0, 360),
                ),
                Faxify(
                    scale_range=(0.3, 0.6),
                    monochrome=random.choice([0, 1]),
                    monochrome_method="random",
                    monochrome_arguments={},
                    halftone=random.choice([0, 1]),
                    invert=1,
                    half_kernel_size=random.choice([(1, 1), (2, 2)]),
                    angle=(0, 360),
                    sigma=(1, 3),
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                InkMottling(
                    ink_mottling_alpha_range=(0.2, 0.3),
                    ink_mottling_noise_scale_range=(2, 2),
                    ink_mottling_gaussian_kernel_range=(3, 5),
                ),
                ReflectedLight(
                    reflected_light_smoothness=0.8,
                    reflected_light_internal_radius_range=(0.0, 0.001),
                    reflected_light_external_radius_range=(0.5, 0.8),
                    reflected_light_minor_major_ratio_range=(0.9, 1.0),
                    reflected_light_color=(255, 255, 255),
                    reflected_light_internal_max_brightness_range=(0.75, 0.75),
                    reflected_light_external_max_brightness_range=(0.5, 0.75),
                    reflected_light_location="random",
                    reflected_light_ellipse_angle_range=(0, 360),
                    reflected_light_gaussian_kernel_size_range=(5, 310),
                    # NOTE(review): this p is set in addition to the p of the
                    # enclosing OneOf - confirm that the combination is intended.
                    p=0.1,
                ),
            ],
            p=0.1,
        ),
        OneOf(
            [
                PageBorder(
                    page_border_width_height="random",
                    page_border_color=(0, 0, 0),
                    page_border_background_color=(0, 0, 0),
                    page_numbers="random",
                    page_rotation_angle_range=(-3, 3),
                    curve_frequency=(2, 8),
                    curve_height=(2, 4),
                    curve_length_one_side=(50, 100),
                    same_page_border=random.choice([0, 1]),
                ),
                #BookBinding(
                #    shadow_radius_range=(30, 100),
                #    curve_range_right=(50, 200),
                #    curve_range_left=(50, 200),
                #    curve_ratio_right=(0.1, 0.3),
                #    curve_ratio_left=(0.1, 0.3),
                #    mirror_range=(1.0, 1.0),
                #    binding_align="random",
                #    binding_pages=(5, 10),
                #    curling_direction=-1,
                #    backdrop_color=(0, 0, 0),
                #    enable_shadow=random.choice([0, 1]),
                #),
                Folding(
                    fold_x=None,
                    fold_deviation=(0, 0),
                    fold_count=random.randint(2, 8),
                    fold_noise=0.01,
                    fold_angle_range=(-360, 360),
                    gradient_width=(0.1, 0.2),
                    gradient_height=(0.01, 0.02),
                    backdrop_color=(0, 0, 0),
                ),
            ],
            p=0.1,
        ),
        # Rescale(scale = "original" , p = 1.0)
    ]
    return {'ink_phase': ink_phase,
            'paper_phase': paper_phase,
            'post_phase': post_phase,
            'pre_phase': pre_phase}
464 |
--------------------------------------------------------------------------------