├── .gitignore ├── Makefile ├── README.md ├── dev-requirements.txt ├── requirements.txt ├── setup.py └── src ├── BenchmarkTable.py ├── OrderedPage.py ├── PredictionInfo.py ├── ReadingOrderToken.py ├── SegmentProcessor.py ├── benchmark_candidate_finder.py ├── benchmark_reading_order.py ├── benchmark_segmented_reading_order.py ├── create_candidate_finder_model.py ├── create_reading_order_model.py ├── hyperparameter_optimization.py ├── pdf_reading_order ├── PdfReadingOrderTokens.py ├── ReadingOrderBase.py ├── ReadingOrderCandidatesTrainer.py ├── ReadingOrderLabelPage.py ├── ReadingOrderTrainer.py ├── config.py ├── download_models.py ├── load_labeled_data.py └── model_configuration.py ├── predict.py └── show_reading_orders.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyo 3 | .DS_Store 4 | .idea 5 | .vscode 6 | *.log 7 | nohup.out 8 | /venv/ 9 | *.cache 10 | *_config.py 11 | service.log 12 | **/*.pytest_cache 13 | src/.pytest_cache 14 | /model/ 15 | /src/data/ 16 | /src/tuned_parameters/ 17 | /src/benchmark_table.txt 18 | /src/benchmark_tables/ 19 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | install: 2 | . venv/bin/activate; pip install -r requirements.txt 3 | 4 | install_venv: 5 | python3 -m venv venv 6 | . venv/bin/activate; python -m pip install --upgrade pip 7 | . venv/bin/activate; python -m pip install -r dev-requirements.txt 8 | 9 | formatter: 10 | . venv/bin/activate; command black --line-length 125 . 11 | 12 | check_format: 13 | . venv/bin/activate; command black --line-length 125 . --check 14 | 15 | test: 16 | . 
venv/bin/activate; command cd src; python -m unittest test/test_trainer.py 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
This tool returns the reading order of a PDF
3 | 4 | ## Quick Start 5 | Create venv: 6 | 7 | make install_venv 8 | 9 | Get the reading order of a PDF: 10 | 11 | source venv/bin/activate 12 | python src/predict.py /path/to/pdf 13 | 14 | 15 | ## Train a new model 16 | 17 | Get the labeled data tool from the GitHub repository: 18 | 19 | https://github.com/huridocs/pdf-labeled-data 20 | 21 | Change the paths in src/config.py 22 | 23 | LABELED_DATA_ROOT_PATH = /path/to/pdf-labeled-data/project 24 | TRAINED_MODEL_PATH = /path/to/save/trained/model 25 | 26 | Create venv: 27 | 28 | make install_venv 29 | 30 | Train a new model: 31 | 32 | source venv/bin/activate 33 | python src/create_candidate_finder_model.py 34 | python src/create_reading_order_model.py 35 | 36 | ## Use a custom model 37 | 38 | python src/predict.py /path/to/pdf --model-path /path/to/model 39 | 40 | ## Process figures and tables 41 | 42 | python src/predict.py /path/to/pdf --extract-figures-and-tables 43 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | pytest==7.4.0 3 | black==23.7.0 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huridocs/pdf_paragraphs_extraction@9765c33f32139d6043e67d30b30fd2ba10f4da1d 2 | git+https://github.com/huridocs/pdf-tokens-type-labeler@d237b570f1f3d09dbbd16d0868b7fd9931b327d6 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | PROJECT_NAME = "pdf-reading-order" 4 | 5 | setup( 6 | name=PROJECT_NAME, 7 | packages=["pdf_reading_order"], 8 | package_dir={"": "src"}, 9 | version="0.25", 10 | 
url="https://github.com/huridocs/pdf-reading-order", 11 | author="HURIDOCS", 12 | description="This tool returns the reading order of a PDF", 13 | ) 14 | -------------------------------------------------------------------------------- /src/BenchmarkTable.py: -------------------------------------------------------------------------------- 1 | from os.path import exists 2 | from os import makedirs 3 | from tabulate import tabulate 4 | from PredictionInfo import PredictionInfo 5 | from pdf_reading_order.PdfReadingOrderTokens import PdfReadingOrderTokens 6 | 7 | 8 | class BenchmarkTable: 9 | def __init__(self, pdf_reading_order_tokens_list: list[PdfReadingOrderTokens], total_time: float, table_name=""): 10 | self.pdf_paragraphs_tokens_list: list[PdfReadingOrderTokens] = pdf_reading_order_tokens_list 11 | self.total_time = total_time 12 | self.prediction_info_list = [ 13 | PredictionInfo(pdf_reading_order_tokens) for pdf_reading_order_tokens in pdf_reading_order_tokens_list 14 | ] 15 | self.table_name = table_name 16 | 17 | @staticmethod 18 | def get_mistakes_for_file(predictions_for_file: PredictionInfo): 19 | labels_for_file = 0 20 | mistakes_for_file = 0 21 | for page in predictions_for_file.pdf_reading_order_tokens.pdf_features.pages: 22 | actual_orders = predictions_for_file.actual_reading_orders_by_page[page] 23 | labels_for_file += len(actual_orders) 24 | predicted_orders = predictions_for_file.predicted_reading_orders_by_page[page] 25 | labels_map = {prev_token: next_token for prev_token, next_token in zip(actual_orders, actual_orders[1:])} 26 | mistakes_for_file += sum( 27 | 1 28 | for prev_token, next_token in zip(predicted_orders, predicted_orders[1:]) 29 | if prev_token not in labels_map or labels_map[prev_token] != next_token 30 | ) 31 | return labels_for_file, mistakes_for_file 32 | 33 | def get_mistakes_for_file_type(self, predictions_for_file_type: list[PredictionInfo]): 34 | labels_for_file_type = 0 35 | mistakes_for_file_type = 0 36 | for 
predictions_for_file in predictions_for_file_type: 37 | labels_for_file, mistakes_for_file = self.get_mistakes_for_file(predictions_for_file) 38 | labels_for_file_type += labels_for_file 39 | mistakes_for_file_type += mistakes_for_file 40 | return labels_for_file_type, mistakes_for_file_type 41 | 42 | def get_benchmark_table_rows(self): 43 | benchmark_table_rows: list[list[str]] = [] 44 | file_types = set(info.file_type for info in self.prediction_info_list) 45 | total_label_count = 0 46 | total_mistake_count = 0 47 | for file_type in file_types: 48 | predictions_for_file_type = [info for info in self.prediction_info_list if info.file_type == file_type] 49 | labels_for_file_type, mistakes_for_file_type = self.get_mistakes_for_file_type(predictions_for_file_type) 50 | total_label_count += labels_for_file_type 51 | total_mistake_count += mistakes_for_file_type 52 | accuracy = round(100 - (100 * mistakes_for_file_type / labels_for_file_type), 2) 53 | benchmark_table_rows.append([file_type, f"{mistakes_for_file_type}/{labels_for_file_type} ({accuracy}%)"]) 54 | 55 | return benchmark_table_rows, total_label_count, total_mistake_count 56 | 57 | def prepare_benchmark_table(self): 58 | table_headers = ["File Type", "Mistakes"] 59 | table_rows, total_label_count, total_mistake_count = self.get_benchmark_table_rows() 60 | average_accuracy = round(100 - (100 * total_mistake_count / total_label_count), 2) 61 | if not exists("benchmark_tables"): 62 | makedirs("benchmark_tables") 63 | table_path = f"benchmark_tables/benchmark_table{self.table_name}.txt" 64 | with open(table_path, "w") as benchmark_file: 65 | benchmark_table = ( 66 | tabulate(tabular_data=table_rows, headers=table_headers) 67 | + "\n\n" 68 | + f"Average Accuracy: {total_mistake_count} Mistakes/{total_label_count} Labels ({average_accuracy}%)" 69 | + "\n" 70 | + f"Total Time: {round(self.total_time, 2)}" 71 | ) 72 | benchmark_file.write(benchmark_table) 73 | 
-------------------------------------------------------------------------------- /src/OrderedPage.py: -------------------------------------------------------------------------------- 1 | from pdf_features.PdfToken import PdfToken 2 | from ReadingOrderToken import ReadingOrderToken 3 | 4 | 5 | class OrderedPage: 6 | def __init__(self, pdf_name: str, page_number: int, reading_order_tokens: list[ReadingOrderToken]): 7 | self.pdf_name = pdf_name 8 | self.page_number = page_number 9 | self.reading_order_tokens = reading_order_tokens 10 | 11 | @staticmethod 12 | def from_pdf_tokens(pdf_name: str, page_number: int, pdf_tokens: list[PdfToken]): 13 | reading_order_tokens = [ 14 | ReadingOrderToken(token.bounding_box, token.content, token.token_type, reading_order_no) 15 | for reading_order_no, token in enumerate(pdf_tokens) 16 | ] 17 | 18 | return OrderedPage(pdf_name, page_number, reading_order_tokens) 19 | 20 | def to_dict(self): 21 | return { 22 | "pdf_name": self.pdf_name, 23 | "page_number": self.page_number, 24 | "tokens": [reading_order_token.to_dict() for reading_order_token in self.reading_order_tokens], 25 | } 26 | -------------------------------------------------------------------------------- /src/PredictionInfo.py: -------------------------------------------------------------------------------- 1 | from pdf_features.PdfPage import PdfPage 2 | from pdf_features.PdfToken import PdfToken 3 | from pdf_reading_order.PdfReadingOrderTokens import PdfReadingOrderTokens 4 | 5 | 6 | class PredictionInfo: 7 | def __init__(self, pdf_reading_order_tokens: PdfReadingOrderTokens): 8 | self.pdf_reading_order_tokens = pdf_reading_order_tokens 9 | self.file_name = pdf_reading_order_tokens.pdf_features.file_name 10 | self.file_type = pdf_reading_order_tokens.pdf_features.file_type 11 | self.label_count = 0 12 | self.mistake_count = 0 13 | self.actual_reading_orders_by_page: dict[PdfPage, list[PdfToken]] = {} 14 | self.predicted_reading_orders_by_page: dict[PdfPage, 
list[PdfToken]] = {}
        self.get_actual_and_predicted_orders()

    def get_actual_and_predicted_orders(self):
        """Populate both order dicts: the labeled order comes from the page's
        reading-order labels, the predicted order from each token's prediction."""
        for page in self.pdf_reading_order_tokens.pdf_features.pages:
            page_reading_orders = self.pdf_reading_order_tokens.labeled_page_by_raw_page[page].reading_order_by_token_id
            actual_order = sorted([token for token in page.tokens], key=lambda t: page_reading_orders[t.id])
            predicted_order = sorted([token for token in page.tokens], key=lambda t: t.prediction)
            self.actual_reading_orders_by_page[page] = actual_order
            self.predicted_reading_orders_by_page[page] = predicted_order

# /src/ReadingOrderToken.py
from pdf_features.Rectangle import Rectangle
from pdf_token_type_labels.TokenType import TokenType


class ReadingOrderToken:
    """A PDF token enriched with its position in the reading order."""

    def __init__(self, bounding_box: Rectangle, content: str, token_type: TokenType, reading_order_no: int):
        self.bounding_box = bounding_box
        self.content = content
        self.token_type = token_type
        self.reading_order_no = reading_order_no

    def to_dict(self):
        """Serialize this token to a plain dictionary."""
        return {
            "bounding_box": self.bounding_box.to_dict(),
            "content": self.content,
            "token_type": self.token_type.value,
            "reading_order_no": self.reading_order_no,
        }

# /src/SegmentProcessor.py
from pathlib import Path
from statistics import mode
from pdf_features.PdfPage import PdfPage
from pdf_features.PdfToken import PdfToken
from pdf_features.Rectangle import Rectangle
from pdf_token_type_labels.TokenType import TokenType
from pdf_reading_order.PdfReadingOrderTokens import PdfReadingOrderTokens
from
pdf_reading_order.ReadingOrderLabelPage import ReadingOrderLabelPage
from paragraph_extraction_trainer.Paragraph import Paragraph
from paragraph_extraction_trainer.download_models import paragraph_extraction_model_path
from paragraph_extraction_trainer.ParagraphExtractorTrainer import ParagraphExtractorTrainer
from paragraph_extraction_trainer.model_configuration import MODEL_CONFIGURATION as PARAGRAPH_EXTRACTOR_CONFIGURATION


class SegmentProcessor:
    """Merges the tokens of each extracted paragraph (segment) into a single
    synthetic token, keeping pages and reading-order labels in sync."""

    def __init__(
        self,
        pdf_reading_order_tokens_list: list[PdfReadingOrderTokens],
        segment_types: list[TokenType] = None,
        model_path: str | Path = None,
    ):
        self.pdf_reading_order_tokens_list = pdf_reading_order_tokens_list
        pdf_features_list = [pdf_reading_order.pdf_features for pdf_reading_order in pdf_reading_order_tokens_list]
        self.paragraph_extractor = ParagraphExtractorTrainer(pdf_features_list, PARAGRAPH_EXTRACTOR_CONFIGURATION)
        # Fall back to the downloaded default model when no explicit path is given.
        self.model_path = paragraph_extraction_model_path if model_path is None else model_path
        # None means "merge every segment type".
        self.segment_types = segment_types

    @staticmethod
    def get_processed_token_from_paragraph(paragraph_tokens: list[PdfToken], label_page: ReadingOrderLabelPage):
        """Build one merged PdfToken covering all tokens of a paragraph.

        The merged token takes the first token's page, an 'm'-prefixed id derived
        from the first token's id, the concatenated content, the most common
        font/type and the merged bounding box. When a label page is given the
        merged token inherits the smallest reading order of its members and is
        registered in the label map."""
        page_number = paragraph_tokens[0].page_number
        token_id_number = paragraph_tokens[0].id.split("_t")[-1]
        token_id = f"p{page_number}_m{token_id_number}"
        content = " ".join([token.content for token in paragraph_tokens])
        pdf_font = mode([token.font for token in paragraph_tokens])
        reading_order = -1
        if label_page:
            reading_order = min([label_page.reading_order_by_token_id[token.id] for token in paragraph_tokens])
            label_page.reading_order_by_token_id[token_id] = reading_order
        bounding_box = Rectangle.merge_rectangles([token.bounding_box for token in paragraph_tokens])
        token_type = mode([token.token_type for token in paragraph_tokens])
        return PdfToken(page_number, token_id, content, pdf_font, reading_order, bounding_box, token_type)

    @staticmethod
    def remove_paragraph_tokens_from_labels(paragraph_tokens: list[PdfToken], label_page: ReadingOrderLabelPage):
        """Drop the merged-away member tokens from the reading-order label map."""
        for paragraph_token in paragraph_tokens:
            del label_page.reading_order_by_token_id[paragraph_token.id]

    @staticmethod
    def remove_paragraph_tokens_from_page(paragraph_tokens: list[PdfToken], page: PdfPage):
        """Drop the merged-away member tokens from the page's token list."""
        for paragraph_token in paragraph_tokens:
            page.tokens.remove(paragraph_token)

    @staticmethod
    def add_processed_token_to_page(figure_table_token: PdfToken, paragraph_tokens: list[PdfToken], page: PdfPage):
        """Insert the merged token at the position of the earliest member token."""
        insert_index = sorted([page.tokens.index(token) for token in paragraph_tokens])[0]
        page.tokens.insert(insert_index, figure_table_token)

    @staticmethod
    def reassign_labels(label_page):
        """Renumber reading orders to be consecutive (0..n-1) after deletions."""
        reading_order = 0
        for token_id, _ in sorted(label_page.reading_order_by_token_id.items(), key=lambda item: item[1]):
            label_page.reading_order_by_token_id[token_id] = reading_order
            reading_order += 1

    def process_a_paragraph(self, paragraph: Paragraph, pdf_reading_order_tokens: PdfReadingOrderTokens):
        """Merge one paragraph into a single token if its (most common) type is
        among the configured segment types; keep the page and labels in sync."""
        segment_type = mode([token.token_type for token in paragraph.tokens])
        if self.segment_types is not None and segment_type not in self.segment_types:
            return
        page = pdf_reading_order_tokens.pdf_features.pages[paragraph.tokens[0].page_number - 1]
        label_page = (
            pdf_reading_order_tokens.labeled_page_by_raw_page[page]
            if page in pdf_reading_order_tokens.labeled_page_by_raw_page
            else None
        )
        segment_token = self.get_processed_token_from_paragraph(paragraph.tokens, label_page)
        # Insert the merged token BEFORE removing the originals so the insert
        # index can still be computed from the member tokens' positions.
        self.add_processed_token_to_page(segment_token, paragraph.tokens, page)
        self.remove_paragraph_tokens_from_page(paragraph.tokens, page)
        if label_page:
            self.remove_paragraph_tokens_from_labels(paragraph.tokens, label_page)
            self.reassign_labels(label_page)

    def process(self):
        """Run paragraph extraction over all documents and merge each paragraph.

        Paragraphs come back as one flat list across documents; cumulative token
        counts detect when the stream crosses a document boundary.
        NOTE(review): assumes every document yields at least one token — a
        zero-token document would desynchronize the boundary detection; confirm."""
        paragraphs: list[Paragraph] = self.paragraph_extractor.get_paragraphs(self.model_path)
        pdf_reading_order_tokens_index = 0
        pdf_reading_order_tokens = self.pdf_reading_order_tokens_list[pdf_reading_order_tokens_index]
        current_token_count = 0
        document_token_count = sum([len(page.tokens) for page in pdf_reading_order_tokens.pdf_features.pages])
        for paragraph in paragraphs:
            if current_token_count == document_token_count:
                # All tokens of the current document consumed: advance to the next.
                pdf_reading_order_tokens_index += 1
                pdf_reading_order_tokens = self.pdf_reading_order_tokens_list[pdf_reading_order_tokens_index]
                current_token_count = 0
                document_token_count = sum([len(page.tokens) for page in pdf_reading_order_tokens.pdf_features.pages])
            current_token_count += len(paragraph.tokens)
            self.process_a_paragraph(paragraph, pdf_reading_order_tokens)

# /src/benchmark_candidate_finder.py
import os
import pickle
from os.path import join
import numpy as np
from time import time
from pdf_token_type_labels.TokenType import TokenType
from pdf_reading_order.config import ROOT_PATH, PDF_LABELED_DATA_ROOT_PATH
from pdf_reading_order.load_labeled_data import load_labeled_data
from pdf_reading_order.ReadingOrderCandidatesTrainer import ReadingOrderCandidatesTrainer
from pdf_reading_order.model_configuration import CANDIDATE_MODEL_CONFIGURATION
from SegmentProcessor import SegmentProcessor

BENCHMARK_MODEL_PATH = join(ROOT_PATH, "model", "candidate_selector_benchmark")
CANDIDATES_X_TRAIN_PATH = "data/candidates_X_train.pickle"
CANDIDATES_Y_TRAIN_PATH = "data/candidates_y_train.pickle"
CANDIDATES_X_TEST_PATH = "data/candidates_X_test.pickle"
CANDIDATES_Y_TEST_PATH = "data/candidates_y_test.pickle"
PDF_READING_ORDER_TOKENS_TRAIN_PATH =
"data/pdf_reading_order_tokens_train.pickle" 19 | PDF_READING_ORDER_TOKENS_TEST_PATH = "data/pdf_reading_order_tokens_test.pickle" 20 | 21 | 22 | def prepare_features(dataset_type, x_path, y_path): 23 | pdf_reading_order_tokens_list = get_pdf_reading_order_tokens(dataset_type) 24 | trainer = ReadingOrderCandidatesTrainer(pdf_reading_order_tokens_list, None) 25 | x, y = trainer.get_training_data() 26 | if not os.path.exists(join(ROOT_PATH, "src", "data")): 27 | os.makedirs(join(ROOT_PATH, "src", "data")) 28 | with open(x_path, "wb") as x_file: 29 | pickle.dump(x, x_file) 30 | with open(y_path, "wb") as y_file: 31 | pickle.dump(y, y_file) 32 | return x, np.array(y) 33 | 34 | 35 | def get_features(dataset_type: str = "train"): 36 | x_path = CANDIDATES_X_TRAIN_PATH if dataset_type == "train" else CANDIDATES_X_TEST_PATH 37 | y_path = CANDIDATES_Y_TRAIN_PATH if dataset_type == "train" else CANDIDATES_Y_TEST_PATH 38 | 39 | if os.path.exists(x_path) and os.path.exists(y_path): 40 | with open(x_path, "rb") as f: 41 | x_features = pickle.load(f) 42 | with open(y_path, "rb") as f: 43 | y_features = np.array(pickle.load(f)) 44 | return x_features, np.array(y_features) 45 | 46 | return prepare_features(dataset_type, x_path, y_path) 47 | 48 | 49 | def prepare_pdf_reading_order_tokens_list(dataset_type, file_path): 50 | pdf_reading_order_tokens_list = load_labeled_data(PDF_LABELED_DATA_ROOT_PATH, filter_in=dataset_type) 51 | start_time = time() 52 | table_figure_processor = SegmentProcessor(pdf_reading_order_tokens_list, [TokenType.FIGURE, TokenType.TABLE]) 53 | table_figure_processor.process() 54 | total_time = time() - start_time 55 | print(f"Table figure processing took: {round(total_time, 2)} seconds.") 56 | if not os.path.exists(join(ROOT_PATH, "src", "data")): 57 | os.makedirs(join(ROOT_PATH, "src", "data")) 58 | with open(file_path, "wb") as pdf_reading_order_tokens_file: 59 | pickle.dump(pdf_reading_order_tokens_list, pdf_reading_order_tokens_file) 60 | return 
pdf_reading_order_tokens_list 61 | 62 | 63 | def get_pdf_reading_order_tokens(dataset_type: str = "train"): 64 | file_path = PDF_READING_ORDER_TOKENS_TRAIN_PATH if dataset_type == "train" else PDF_READING_ORDER_TOKENS_TEST_PATH 65 | if os.path.exists(file_path): 66 | with open(file_path, "rb") as f: 67 | return pickle.load(f) 68 | 69 | return prepare_pdf_reading_order_tokens_list(dataset_type, file_path) 70 | 71 | 72 | def train_for_benchmark(model_path: str, include_test_set: bool = False): 73 | x_train, y_train = get_features("train") 74 | if include_test_set: 75 | x_test, y_test = get_features("test") 76 | x_train = np.concatenate((x_train, x_test), axis=0) 77 | y_train = np.append(y_train, y_test) 78 | trainer = ReadingOrderCandidatesTrainer([], CANDIDATE_MODEL_CONFIGURATION) 79 | trainer.train(model_path, x_train, y_train) 80 | 81 | 82 | def test_for_benchmark(): 83 | pdf_reading_order_tokens_list = get_pdf_reading_order_tokens("test") 84 | results = [] 85 | start_time = time() 86 | for model_name in sorted(os.listdir(join(ROOT_PATH, "model"))): 87 | for candidate_count in [18]: 88 | if not model_name.startswith("candidate"): 89 | continue 90 | print(f"Testing: {model_name} with {candidate_count} candidate") 91 | model_path = join(join(ROOT_PATH, "model", model_name)) 92 | trainer = ReadingOrderCandidatesTrainer(pdf_reading_order_tokens_list, CANDIDATE_MODEL_CONFIGURATION) 93 | mistake_count_for_model = trainer.predict(model_path, candidate_count) 94 | results.append([model_name, mistake_count_for_model, candidate_count]) 95 | print(f"Elapsed time: {time() - start_time} seconds") 96 | results.sort(key=lambda result: result[1]) 97 | for result in results: 98 | print(result) 99 | 100 | 101 | def train_models_for_comparison(): 102 | start_time = time() 103 | for num_boost_round in [500]: 104 | for num_leaves in [455]: 105 | CANDIDATE_MODEL_CONFIGURATION.num_boost_round = num_boost_round 106 | CANDIDATE_MODEL_CONFIGURATION.num_leaves = num_leaves 107 | model_path: 
str = BENCHMARK_MODEL_PATH + f"_nbr{num_boost_round}_nl{num_leaves}.model" 108 | train_for_benchmark(model_path) 109 | print(f"Elapsed time: {time() - start_time} seconds") 110 | 111 | 112 | if __name__ == "__main__": 113 | train_models_for_comparison() 114 | test_for_benchmark() 115 | -------------------------------------------------------------------------------- /src/benchmark_reading_order.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from os.path import join 4 | import numpy as np 5 | from time import time 6 | from pdf_token_type_labels.TokenType import TokenType 7 | from BenchmarkTable import BenchmarkTable 8 | from SegmentProcessor import SegmentProcessor 9 | from pdf_reading_order.PdfReadingOrderTokens import PdfReadingOrderTokens 10 | from pdf_reading_order.ReadingOrderTrainer import ReadingOrderTrainer 11 | from pdf_reading_order.config import ROOT_PATH, PDF_LABELED_DATA_ROOT_PATH 12 | from pdf_reading_order.load_labeled_data import load_labeled_data 13 | from pdf_reading_order.model_configuration import READING_ORDER_MODEL_CONFIGURATION 14 | from pdf_reading_order.ReadingOrderTrainer import CANDIDATE_COUNT 15 | 16 | BENCHMARK_MODEL_PATH = join(ROOT_PATH, "model", "reading_order_benchmark.model") 17 | BENCHMARK_COMPARISON_MODEL_PATH = join(ROOT_PATH, "model", "reading_order_benchmark") 18 | READING_ORDER_X_TRAIN_PATH = f"data/reading_order_{CANDIDATE_COUNT}_X_train.pickle" 19 | READING_ORDER_Y_TRAIN_PATH = f"data/reading_order_{CANDIDATE_COUNT}_y_train.pickle" 20 | READING_ORDER_X_TEST_PATH = f"data/reading_order_{CANDIDATE_COUNT}_X_test.pickle" 21 | READING_ORDER_Y_TEST_PATH = f"data/reading_order_{CANDIDATE_COUNT}_y_test.pickle" 22 | PDF_READING_ORDER_TOKENS_TRAIN_PATH = "data/pdf_reading_order_tokens_train.pickle" 23 | PDF_READING_ORDER_TOKENS_TEST_PATH = "data/pdf_reading_order_tokens_test.pickle" 24 | 25 | 26 | def prepare_features(dataset_type, x_path, y_path): 27 | 
    pdf_reading_order_tokens_list = get_pdf_reading_order_tokens(dataset_type)
    trainer = ReadingOrderTrainer(pdf_reading_order_tokens_list, None)
    x, y = trainer.get_training_data()
    if not os.path.exists(join(ROOT_PATH, "src", "data")):
        os.makedirs(join(ROOT_PATH, "src", "data"))
    with open(x_path, "wb") as x_file:
        pickle.dump(x, x_file)
    with open(y_path, "wb") as y_file:
        pickle.dump(y, y_file)
    return x, np.array(y)


def get_features(dataset_type: str = "train"):
    """Return cached (x, y) reading-order features, computing them on a cache miss."""
    x_path = READING_ORDER_X_TRAIN_PATH if dataset_type == "train" else READING_ORDER_X_TEST_PATH
    y_path = READING_ORDER_Y_TRAIN_PATH if dataset_type == "train" else READING_ORDER_Y_TEST_PATH
    if os.path.exists(x_path) and os.path.exists(y_path):
        with open(x_path, "rb") as f:
            x_features = pickle.load(f)
        with open(y_path, "rb") as f:
            y_features = np.array(pickle.load(f))
        return x_features, np.array(y_features)

    return prepare_features(dataset_type, x_path, y_path)


def loop_pages(pdf_reading_order_tokens_list: list[PdfReadingOrderTokens]):
    """Yield (label_page, page) pairs for every page of every document."""
    for pdf_reading_order_tokens in pdf_reading_order_tokens_list:
        for page in pdf_reading_order_tokens.pdf_features.pages:
            label_page = pdf_reading_order_tokens.labeled_page_by_raw_page[page]
            yield label_page, page


def find_mistake_count(pdf_reading_order_tokens_list: list[PdfReadingOrderTokens]):
    """Count consecutive token pairs whose predicted succession disagrees with the labels."""
    mistakes = 0
    for label_page, page in loop_pages(pdf_reading_order_tokens_list):
        for token_1, token_2 in zip(page.tokens, page.tokens[1:]):
            mistakes += 0 if label_page.is_next_token(token_1, token_2) else 1
    return mistakes


def prepare_pdf_reading_order_tokens_list(dataset_type, file_path):
    """Load labeled data, merge figure/table segments, and cache the result."""
    pdf_reading_order_tokens_list = load_labeled_data(PDF_LABELED_DATA_ROOT_PATH, filter_in=dataset_type)
    start_time = time()
    table_figure_processor = SegmentProcessor(pdf_reading_order_tokens_list, [TokenType.FIGURE, TokenType.TABLE])
    table_figure_processor.process()
    total_time = time() - start_time
    print(f"Table figure processing took: {round(total_time, 2)} seconds.")
    with open(file_path, "wb") as pdf_reading_order_tokens_file:
        pickle.dump(pdf_reading_order_tokens_list, pdf_reading_order_tokens_file)
    return pdf_reading_order_tokens_list


def get_pdf_reading_order_tokens(dataset_type: str = "train"):
    """Return the cached processed token list for dataset_type, building it on a miss."""
    print(f"Loading {dataset_type} data...")
    file_path = PDF_READING_ORDER_TOKENS_TRAIN_PATH if dataset_type == "train" else PDF_READING_ORDER_TOKENS_TEST_PATH
    if not os.path.exists(join(ROOT_PATH, "src", "data")):
        os.makedirs(join(ROOT_PATH, "src", "data"))
    if os.path.exists(file_path):
        with open(file_path, "rb") as f:
            return pickle.load(f)

    return prepare_pdf_reading_order_tokens_list(dataset_type, file_path)


def predict_for_benchmark(pdf_reading_order_tokens_list: list[PdfReadingOrderTokens], get_granular_scores: bool):
    """Run model prediction, optionally writing a per-file-type benchmark table.

    Returns the wall-clock prediction time in seconds."""
    trainer = ReadingOrderTrainer(pdf_reading_order_tokens_list, READING_ORDER_MODEL_CONFIGURATION)
    print(f"Model prediction started...")
    start_time = time()
    trainer.predict(BENCHMARK_MODEL_PATH)
    total_time = time() - start_time
    if get_granular_scores:
        benchmark_table = BenchmarkTable(pdf_reading_order_tokens_list, total_time)
        benchmark_table.prepare_benchmark_table()
    return total_time


def train_for_benchmark(include_test_set: bool = False):
    """Train the benchmark reading-order model, optionally including the test set."""
    x_train, y_train = get_features("train")
    if include_test_set:
        x_test, y_test = get_features("test")
        x_train = np.concatenate((x_train, x_test), axis=0)
        y_train = np.append(y_train, y_test)
    trainer = ReadingOrderTrainer([], READING_ORDER_MODEL_CONFIGURATION)
    trainer.train(BENCHMARK_MODEL_PATH, x_train, y_train)


def benchmark(get_granular_scores: bool):
    """Predict on the test set and report total mistakes and timing."""
    pdf_reading_order_tokens_list = get_pdf_reading_order_tokens("test")
    total_time = predict_for_benchmark(pdf_reading_order_tokens_list, get_granular_scores)
    mistake_count_for_model = find_mistake_count(pdf_reading_order_tokens_list)
    print(f"{mistake_count_for_model} mistakes found. Total time: {total_time}")


if __name__ == "__main__":
    train_for_benchmark()
    benchmark(True)

# /src/benchmark_segmented_reading_order.py
import os
import pickle
from os.path import join
from time import time
import numpy as np
import pdf_reading_order.ReadingOrderTrainer
from BenchmarkTable import BenchmarkTable
from benchmark_reading_order import find_mistake_count
from pdf_reading_order.PdfReadingOrderTokens import PdfReadingOrderTokens
from pdf_reading_order.ReadingOrderTrainer import ReadingOrderTrainer
from pdf_reading_order.config import PDF_LABELED_DATA_ROOT_PATH
from pdf_reading_order.load_labeled_data import load_labeled_data
from SegmentProcessor import SegmentProcessor
from pdf_reading_order.config import ROOT_PATH
from pdf_reading_order.model_configuration import SEGMENTED_READING_ORDER_MODEL_CONFIGURATION

BENCHMARK_MODEL_PATH = join(ROOT_PATH, "model", "segmented_reading_order_benchmark.model")
BENCHMARK_COMPARISON_MODEL_PATH = join(ROOT_PATH, "model", "segmented_reading_order_benchmark")
SEGMENTED_PDF_READING_ORDER_TOKENS_TRAIN_PATH = "data/segmented_pdf_reading_order_tokens_train.pickle"
SEGMENTED_PDF_READING_ORDER_TOKENS_TEST_PATH = "data/segmented_pdf_reading_order_tokens_test.pickle"
SEGMENTED_READING_ORDER_X_TRAIN_PATH = f"data/segmented_reading_order_X_train.pickle"
SEGMENTED_READING_ORDER_Y_TRAIN_PATH = f"data/segmented_reading_order_y_train.pickle"
SEGMENTED_READING_ORDER_X_TEST_PATH
= f"data/segmented_reading_order_X_test.pickle"
SEGMENTED_READING_ORDER_Y_TEST_PATH = f"data/segmented_reading_order_y_test.pickle"


def prepare_features(dataset_type, x_path, y_path):
    """Extract segmented reading-order features for dataset_type and cache them."""
    pdf_reading_order_tokens_list = get_segmented_pdf_reading_order_tokens(dataset_type)
    trainer = ReadingOrderTrainer(pdf_reading_order_tokens_list, None)
    x, y = trainer.get_training_data()
    if not os.path.exists(join(ROOT_PATH, "src", "data")):
        os.makedirs(join(ROOT_PATH, "src", "data"))
    with open(x_path, "wb") as x_file:
        pickle.dump(x, x_file)
    with open(y_path, "wb") as y_file:
        pickle.dump(y, y_file)
    return x, np.array(y)


def get_features(dataset_type: str = "train"):
    """Return cached (x, y) segmented features, computing them on a cache miss."""
    x_path = SEGMENTED_READING_ORDER_X_TRAIN_PATH if dataset_type == "train" else SEGMENTED_READING_ORDER_X_TEST_PATH
    y_path = SEGMENTED_READING_ORDER_Y_TRAIN_PATH if dataset_type == "train" else SEGMENTED_READING_ORDER_Y_TEST_PATH
    if os.path.exists(x_path) and os.path.exists(y_path):
        with open(x_path, "rb") as f:
            x_features = pickle.load(f)
        with open(y_path, "rb") as f:
            y_features = np.array(pickle.load(f))
        return x_features, np.array(y_features)

    return prepare_features(dataset_type, x_path, y_path)


def prepare_segmented_pdf_reading_order_tokens_list(dataset_type, file_path):
    """Load labeled data, merge ALL segment types (no filter), and cache the result."""
    pdf_reading_order_tokens_list = load_labeled_data(PDF_LABELED_DATA_ROOT_PATH, filter_in=dataset_type)
    start_time = time()
    segment_processor = SegmentProcessor(pdf_reading_order_tokens_list)
    segment_processor.process()
    total_time = time() - start_time
    print(f"Segment processing took: {round(total_time, 2)} seconds.")
    with open(file_path, "wb") as pdf_reading_order_tokens_file:
        pickle.dump(pdf_reading_order_tokens_list, pdf_reading_order_tokens_file)
    return pdf_reading_order_tokens_list


def get_segmented_pdf_reading_order_tokens(dataset_type: str = "train"):
    """Return the cached segmented token list for dataset_type, building it on a miss."""
    print(f"Loading {dataset_type} data...")
    file_path = (
        SEGMENTED_PDF_READING_ORDER_TOKENS_TRAIN_PATH
        if dataset_type == "train"
        else SEGMENTED_PDF_READING_ORDER_TOKENS_TEST_PATH
    )
    if not os.path.exists(join(ROOT_PATH, "src", "data")):
        os.makedirs(join(ROOT_PATH, "src", "data"))
    if os.path.exists(file_path):
        with open(file_path, "rb") as f:
            return pickle.load(f)

    return prepare_segmented_pdf_reading_order_tokens_list(dataset_type, file_path)


def train_for_benchmark(include_test_set: bool = False):
    """Train the segmented benchmark model with the candidates model disabled."""
    pdf_reading_order.ReadingOrderTrainer.USE_CANDIDATES_MODEL = False
    x_train, y_train = get_features("train")
    if include_test_set:
        x_test, y_test = get_features("test")
        x_train = np.concatenate((x_train, x_test), axis=0)
        y_train = np.append(y_train, y_test)
    trainer = ReadingOrderTrainer([], SEGMENTED_READING_ORDER_MODEL_CONFIGURATION)
    trainer.train(BENCHMARK_MODEL_PATH, x_train, y_train)


def predict_for_benchmark(pdf_reading_order_tokens_list: list[PdfReadingOrderTokens], get_granular_scores: bool):
    """Run segmented model prediction, optionally writing a benchmark table.

    Returns the wall-clock prediction time in seconds."""
    trainer = ReadingOrderTrainer(pdf_reading_order_tokens_list, SEGMENTED_READING_ORDER_MODEL_CONFIGURATION)
    print(f"Model prediction started...")
    start_time = time()
    trainer.predict(BENCHMARK_MODEL_PATH)
    total_time = time() - start_time
    if get_granular_scores:
        table_name = f"_segmented_reading_order"
        benchmark_table = BenchmarkTable(pdf_reading_order_tokens_list, total_time, table_name)
        benchmark_table.prepare_benchmark_table()
    return total_time


def benchmark(get_granular_scores: bool):
    """Predict on the segmented test set and report total mistakes and timing."""
    pdf_reading_order.ReadingOrderTrainer.USE_CANDIDATES_MODEL = False
    pdf_reading_order_tokens_list = get_segmented_pdf_reading_order_tokens("test")
    total_time = predict_for_benchmark(pdf_reading_order_tokens_list, get_granular_scores)
    mistake_count_for_model = find_mistake_count(pdf_reading_order_tokens_list)
    print(f"{mistake_count_for_model} mistakes found. Total time: {total_time}")


if __name__ == "__main__":
    train_for_benchmark()
    benchmark(True)

# /src/create_candidate_finder_model.py
from os.path import join
from os import makedirs
from pathlib import Path
import numpy as np
from benchmark_candidate_finder import get_features
from pdf_reading_order.ReadingOrderCandidatesTrainer import ReadingOrderCandidatesTrainer
from pdf_reading_order.model_configuration import CANDIDATE_MODEL_CONFIGURATION
from pdf_reading_order.config import ROOT_PATH


MODEL_PATH = Path(join(ROOT_PATH, "model", "candidate_selector_model.model"))


def train_model():
    """Train the final candidate-selector model on train + test features."""
    x_train, y_train = get_features("train")
    x_test, y_test = get_features("test")
    x_train = np.concatenate((x_train, x_test), axis=0)
    y_train = np.append(y_train, y_test)
    trainer = ReadingOrderCandidatesTrainer([], CANDIDATE_MODEL_CONFIGURATION)
    makedirs(MODEL_PATH.parent, exist_ok=True)
    trainer.train(MODEL_PATH, x_train, y_train)


if __name__ == "__main__":
    train_model()

# /src/create_reading_order_model.py
from os.path import join
from os import makedirs
from pathlib import Path
import numpy as np
from benchmark_reading_order import get_features
from pdf_reading_order.ReadingOrderTrainer import ReadingOrderTrainer
from pdf_reading_order.model_configuration import READING_ORDER_MODEL_CONFIGURATION
from pdf_reading_order.config import ROOT_PATH


MODEL_PATH =
Path(join(ROOT_PATH, "model", "reading_order_model.model")) 12 | 13 | 14 | def train_model(): 15 | x_train, y_train = get_features("train") 16 | x_test, y_test = get_features("test") 17 | x_train = np.concatenate((x_train, x_test), axis=0) 18 | y_train = np.append(y_train, y_test) 19 | trainer = ReadingOrderTrainer([], READING_ORDER_MODEL_CONFIGURATION) 20 | makedirs(MODEL_PATH.parent, exist_ok=True) 21 | trainer.train(MODEL_PATH, x_train, y_train) 22 | 23 | 24 | if __name__ == "__main__": 25 | train_model() 26 | -------------------------------------------------------------------------------- /src/hyperparameter_optimization.py: -------------------------------------------------------------------------------- 1 | import optuna 2 | import lightgbm as lgb 3 | import pickle 4 | import numpy as np 5 | from os.path import exists 6 | from functools import partial 7 | from pdf_token_type_labels.TokenType import TokenType 8 | from sklearn.metrics import roc_auc_score 9 | from sklearn.model_selection import StratifiedKFold 10 | 11 | import pdf_reading_order.ReadingOrderTrainer 12 | from SegmentProcessor import SegmentProcessor 13 | from pdf_reading_order.ReadingOrderCandidatesTrainer import ReadingOrderCandidatesTrainer 14 | from pdf_reading_order.ReadingOrderTrainer import CANDIDATE_COUNT 15 | from pdf_reading_order.ReadingOrderTrainer import ReadingOrderTrainer 16 | from pdf_reading_order.load_labeled_data import load_labeled_data 17 | from pdf_reading_order.config import PDF_LABELED_DATA_ROOT_PATH 18 | 19 | CANDIDATES_DATA_PATH = "data/candidates_X_train.pickle" 20 | CANDIDATES_LABEL_PATH = "data/candidates_y_train.pickle" 21 | READING_ORDER_DATA_PATH = f"data/reading_order_{CANDIDATE_COUNT}_X_train.pickle" 22 | READING_ORDER_LABEL_PATH = f"data/reading_order_{CANDIDATE_COUNT}_y_train.pickle" 23 | SEGMENTED_READING_ORDER_DATA_PATH = f"data/segmented_reading_order_X_train.pickle" 24 | SEGMENTED_READING_ORDER_LABEL_PATH = f"data/segmented_reading_order_y_train.pickle" 25 | 
# --- src/hyperparameter_optimization.py (continued) ---


def process_segments(pdf_reading_order_tokens_list, segment_types: list[TokenType] = None):
    """Merge the tokens of the given segment types into single segments (all types when None)."""
    print("Figures and tables are being processed...")
    table_figure_processor = SegmentProcessor(pdf_reading_order_tokens_list, segment_types)
    table_figure_processor.process()
    print("Figures and table processing finished.")


def create_candidates_pickle():
    """Cache the candidate selector training features for the train split."""
    pdf_reading_order_tokens_list = load_labeled_data(PDF_LABELED_DATA_ROOT_PATH, filter_in="train")
    process_segments(pdf_reading_order_tokens_list, [TokenType.FIGURE, TokenType.TABLE])

    trainer = ReadingOrderCandidatesTrainer(pdf_reading_order_tokens_list, None)
    x, y = trainer.get_training_data()

    with open(CANDIDATES_DATA_PATH, "wb") as x_file:
        pickle.dump(x, x_file)
    with open(CANDIDATES_LABEL_PATH, "wb") as y_file:
        pickle.dump(y, y_file)


def create_reading_order_pickle():
    """Cache the reading order training features for the train split."""
    pdf_reading_order_tokens_list = load_labeled_data(PDF_LABELED_DATA_ROOT_PATH, filter_in="train")
    process_segments(pdf_reading_order_tokens_list, [TokenType.FIGURE, TokenType.TABLE])

    trainer = ReadingOrderTrainer(pdf_reading_order_tokens_list, None)
    x, y = trainer.get_training_data()

    with open(READING_ORDER_DATA_PATH, "wb") as x_file:
        pickle.dump(x, x_file)
    with open(READING_ORDER_LABEL_PATH, "wb") as y_file:
        pickle.dump(y, y_file)


def create_segmented_reading_order_pickle():
    """Cache the segmented reading order training features (candidates model disabled)."""
    pdf_reading_order_tokens_list = load_labeled_data(PDF_LABELED_DATA_ROOT_PATH, filter_in="train")
    process_segments(pdf_reading_order_tokens_list)

    # The segmented variant ranks every remaining segment directly, without the selector model.
    pdf_reading_order.ReadingOrderTrainer.USE_CANDIDATES_MODEL = False
    trainer = ReadingOrderTrainer(pdf_reading_order_tokens_list, None)
    x, y = trainer.get_training_data()

    with open(SEGMENTED_READING_ORDER_DATA_PATH, "wb") as x_file:
        pickle.dump(x, x_file)
    with open(SEGMENTED_READING_ORDER_LABEL_PATH, "wb") as y_file:
        pickle.dump(y, y_file)


def create_pickle_files():
    """Create any missing feature caches for the three tuning tasks."""
    if not exists(CANDIDATES_DATA_PATH):
        print("Getting candidates data")
        create_candidates_pickle()
    if not exists(READING_ORDER_DATA_PATH):
        print("Getting reading order data")
        create_reading_order_pickle()
    if not exists(SEGMENTED_READING_ORDER_DATA_PATH):
        print("Getting segmented reading order data")
        create_segmented_reading_order_pickle()


def get_data(data_path: str, label_path: str):
    """Load a cached (x, y) pair, returning the labels as a numpy array."""
    print("Loading X from: ", data_path)
    print("Loading y from: ", label_path)
    with open(data_path, "rb") as f:
        x_train = pickle.load(f)
    with open(label_path, "rb") as f:
        y_train = pickle.load(f)
    return x_train, np.array(y_train)


def objective(trial: optuna.trial.Trial, task: str):
    """Optuna objective: mean ROC AUC over repeated stratified K-fold CV for the given task.

    Raises ValueError for an unknown task name (the original fell through and crashed
    later with an UnboundLocalError on X).
    """
    data_paths_by_task = {
        "candidates": (CANDIDATES_DATA_PATH, CANDIDATES_LABEL_PATH),
        "reading_order": (READING_ORDER_DATA_PATH, READING_ORDER_LABEL_PATH),
        "segmented_reading_order": (SEGMENTED_READING_ORDER_DATA_PATH, SEGMENTED_READING_ORDER_LABEL_PATH),
    }
    if task not in data_paths_by_task:
        raise ValueError(f"Unknown task: {task}")
    X, y = get_data(*data_paths_by_task[task])

    n_splits = 5
    random_states = [129, 427, 741]
    roc_aucs = []

    for random_state in random_states:
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        for train_idx, val_idx in skf.split(X, y):
            x_train, x_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

            train_data = lgb.Dataset(x_train, label=y_train)
            val_data = lgb.Dataset(x_val, label=y_val, reference=train_data)

            params = {
                "boosting_type": "gbdt",
                "objective": "multiclass",
                "metric": "multi_logloss",
                "learning_rate": 0.1,
                "num_class": 2,
                "num_leaves": trial.suggest_int("num_leaves", 10, 500),
                "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
                "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
                "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
                "lambda_l1": trial.suggest_float("lambda_l1", 1e-08, 10.0, log=True),
                "lambda_l2": trial.suggest_float("lambda_l2", 1e-08, 10.0, log=True),
                "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 100),
                "feature_pre_filter": trial.suggest_categorical("feature_pre_filter", [False, True]),
                # 'num_boost_round': 100,
                "early_stopping_rounds": 10,
                "verbose": -1,
            }
            model = lgb.train(params, train_data, valid_sets=[train_data, val_data], num_boost_round=100)
            y_pred_scores = model.predict(x_val, num_iteration=model.best_iteration)
            roc_auc = roc_auc_score(y_val, y_pred_scores[:, 1], multi_class="ovr")
            roc_aucs.append(roc_auc)

    # len(roc_aucs) == n_splits * len(random_states), so this is the same mean as the original.
    return sum(roc_aucs) / len(roc_aucs)


def optuna_automatic_tuning(task: str):
    """Run a 100-trial Optuna study for the task and append the best parameters to a results file."""
    create_pickle_files()
    study = optuna.create_study(direction="maximize")
    objective_with_task = partial(objective, task=task)
    study.optimize(objective_with_task, n_trials=100)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial: ")
    trial = study.best_trial

    print("Value: ", trial.value)
    print("Params: ")
    result_string: str = ""
    for key, value in trial.params.items():
        print(f"\t{key}: {value}")
        result_string += f'"{key}": {value},\n'
    result_string += f"-> Best trial value: {str(trial.value)}\n"

    result_string += "\n\n\n"

    # src/tuned_parameters/ is gitignored, so it may not exist on a fresh checkout.
    import os

    os.makedirs("src/tuned_parameters", exist_ok=True)
    with open(f"src/tuned_parameters/{task}.txt", "a") as result_file:
        result_file.write(result_string)


if __name__ == "__main__":
    optuna_automatic_tuning("segmented_reading_order")
# --- src/pdf_reading_order/PdfReadingOrderTokens.py ---
from os.path import join
from pdf_features.PdfPage import PdfPage
from pdf_features.Rectangle import Rectangle
from pdf_token_type_labels.PdfLabels import PdfLabels
from pdf_features.PdfFeatures import PdfFeatures
from pdf_reading_order.ReadingOrderLabelPage import ReadingOrderLabelPage
from pdf_reading_order.config import READING_ORDER_RELATIVE_PATH
from pdf_tokens_type_trainer.config import LABELS_FILE_NAME


class PdfReadingOrderTokens:
    """Pairs a PDF's extracted features with per-page reading order labels."""

    def __init__(self, pdf_features: PdfFeatures, labeled_page_by_raw_page: dict[PdfPage, ReadingOrderLabelPage]):
        self.pdf_features: PdfFeatures = pdf_features
        self.labeled_page_by_raw_page: dict[PdfPage, ReadingOrderLabelPage] = labeled_page_by_raw_page

    @staticmethod
    def loop_labels(reading_order_labels: PdfLabels):
        """Yield (label, page_number) pairs, smallest-area labels first within each page."""
        for page in reading_order_labels.pages:
            for label in sorted(page.labels, key=lambda _label: _label.area()):
                yield label, page.number

    @staticmethod
    def loop_tokens_sorted_by_area(pdf_features: PdfFeatures):
        """Yield (page, token) pairs, smallest-area tokens first within each page."""
        for page in pdf_features.pages:
            for token in sorted(page.tokens, key=lambda t: Rectangle.area(t.bounding_box)):
                yield page, token

    @staticmethod
    def from_labeled_data(pdf_labeled_data_root_path, dataset, pdf_name):
        """Build a PdfReadingOrderTokens from the on-disk labeled-data layout."""
        pdf_features = PdfFeatures.from_labeled_data(pdf_labeled_data_root_path, dataset, pdf_name)
        reading_order_labeled_data_path = join(pdf_labeled_data_root_path, READING_ORDER_RELATIVE_PATH)
        reading_order_labels_path = join(reading_order_labeled_data_path, dataset, pdf_name, LABELS_FILE_NAME)
        reading_order_labels = PdfFeatures.load_labels(reading_order_labels_path)
        return PdfReadingOrderTokens.set_reading_orders(pdf_features, reading_order_labels)

    @staticmethod
    def set_reading_orders(pdf_features: PdfFeatures, reading_order_labels: PdfLabels):
        """Match each token to the smallest unused label that covers it (>99.9% overlap).

        Both tokens and labels are visited smallest-area first so nested regions are
        matched before their containers; each label is consumed at most once per page.
        """
        labeled_page_by_raw_page: dict[PdfPage, ReadingOrderLabelPage] = {}
        last_page = None
        used_labels = []
        for page, token in PdfReadingOrderTokens.loop_tokens_sorted_by_area(pdf_features):
            if page != last_page:
                labeled_page_by_raw_page[page] = ReadingOrderLabelPage()
                last_page = page
                used_labels = []
            for label, label_page_number in PdfReadingOrderTokens.loop_labels(reading_order_labels):
                if page.page_number != label_page_number:
                    continue
                if label in used_labels:
                    continue
                if label.intersection_percentage(token.bounding_box) > 99.9:
                    used_labels.append(label)
                    labeled_page_by_raw_page[page].reading_order_by_token_id[token.id] = label.label_type
                    break

        return PdfReadingOrderTokens(pdf_features, labeled_page_by_raw_page)


# --- src/pdf_reading_order/ReadingOrderBase.py ---
import numpy as np
import lightgbm as lgb
from pathlib import Path
from pdf_features.PdfFont import PdfFont
from pdf_features.PdfToken import PdfToken
from pdf_features.Rectangle import Rectangle
from pdf_token_type_labels.TokenType import TokenType
from pdf_tokens_type_trainer.ModelConfiguration import ModelConfiguration
from pdf_tokens_type_trainer.TokenFeatures import TokenFeatures
from pdf_reading_order.PdfReadingOrderTokens import PdfReadingOrderTokens


class ReadingOrderBase:
    """Shared plumbing for reading order trainers: iteration helpers and LightGBM training."""

    def __init__(
        self, pdf_reading_order_tokens_list: list[PdfReadingOrderTokens], model_configuration: ModelConfiguration = None
    ):
        self.pdf_reading_order_tokens_list = pdf_reading_order_tokens_list
        self.model_configuration = model_configuration

    def loop_token_features(self):
        """Yield (document, TokenFeatures, page) for every non-empty page."""
        for pdf_reading_order_tokens in self.pdf_reading_order_tokens_list:
            token_features = TokenFeatures(pdf_reading_order_tokens.pdf_features)

            for page in pdf_reading_order_tokens.pdf_features.pages:
                if not page.tokens:
                    continue

                yield pdf_reading_order_tokens, token_features, page

    def loop_pages(self):
        """Yield (document, page) pairs across all loaded documents."""
        for pdf_reading_order in self.pdf_reading_order_tokens_list:
            for page in pdf_reading_order.pdf_features.pages:
                yield pdf_reading_order, page

    @staticmethod
    def features_rows_to_x(features_rows):
        """Stack a list of equal-length feature rows into a 2-D numpy matrix (empty -> (0, 0))."""
        if not features_rows:
            return np.zeros((0, 0))

        x = np.zeros(((len(features_rows)), len(features_rows[0])))
        for i, v in enumerate(features_rows):
            x[i] = v
        return x

    @staticmethod
    def get_padding_token(segment_number: int, page_number: int):
        """Return a zero-area dummy token used as the 'previous token' before the first real one."""
        return PdfToken(
            page_number,
            "pad_token",
            "",
            PdfFont("pad_font_id", False, False, 0.0, "#000000"),
            segment_number,
            Rectangle(0, 0, 0, 0),
            TokenType.TEXT,
        )

    def train(self, model_path: str | Path, x_train_data: np.ndarray = None, labels: np.ndarray = None):
        """Train a LightGBM model on the given (or freshly extracted) data and save it to model_path.

        Bug fix: the original one-liner
            x_train, y_train = self.get_training_data() if x_train_data is None else x_train_data, labels
        parsed as `x_train, y_train = (<conditional>), labels`, so calling train(path) with no
        explicit data bound the whole (x, y) tuple to x_train and None to y_train, crashing on
        x_train.any().
        """
        if x_train_data is None:
            x_train, y_train = self.get_training_data()
        else:
            x_train, y_train = x_train_data, labels

        if not x_train.any():
            print("No data for training")
            return

        lgb_train = lgb.Dataset(x_train, y_train)
        print(f"Training: {model_path}")

        gbm = lgb.train(self.model_configuration.dict(), lgb_train)
        print(f"Saving")
        gbm.save_model(model_path, num_iteration=gbm.best_iteration)

    def get_training_data(self):
        """Subclasses return (feature_matrix, labels); the base class has no features of its own."""
        pass


# --- src/pdf_reading_order/ReadingOrderCandidatesTrainer.py ---
import numpy as np
import lightgbm as lgb
from pathlib import Path
from pdf_features.PdfToken import PdfToken
from pdf_reading_order.ReadingOrderLabelPage import ReadingOrderLabelPage
from pdf_reading_order.ReadingOrderBase import ReadingOrderBase


class ReadingOrderCandidatesTrainer(ReadingOrderBase):
    """Trains/evaluates the model that shortlists likely next-token candidates."""

    @staticmethod
    def get_features(token_1: PdfToken, token_2: PdfToken):
        """Return the 9 geometric features describing an ordered token pair."""
        return [
            token_1.bounding_box.top,
            token_1.bounding_box.left,
            token_1.bounding_box.right,
            token_1.bounding_box.bottom,
            token_2.bounding_box.top,
            token_2.bounding_box.left,
            token_2.bounding_box.right,
            token_2.bounding_box.bottom,
            token_1.bounding_box.bottom - token_2.bounding_box.top,
        ]

    @staticmethod
    def get_next_token(reading_order_no: int, label_page: ReadingOrderLabelPage, remaining_tokens: list[PdfToken]):
        """Return the remaining token whose labeled reading order equals reading_order_no, else None."""
        for remaining_token in remaining_tokens:
            if label_page.reading_order_by_token_id[remaining_token.id] == reading_order_no:
                return remaining_token

    def loop_token_combinations(self):
        """Walk each page in labeled order, yielding (current, remaining, labels, next_order_no)."""
        for pdf_reading_order, page in self.loop_pages():
            label_page = pdf_reading_order.labeled_page_by_raw_page[page]
            current_token = self.get_padding_token(-1, page.page_number)
            reading_order_no = 1
            remaining_tokens = page.tokens.copy()
            for i in range(len(page.tokens)):
                yield current_token, remaining_tokens, label_page, reading_order_no
                current_token = self.get_next_token(reading_order_no, label_page, remaining_tokens)
                reading_order_no += 1
                remaining_tokens.remove(current_token)

    def get_training_data(self):
        """Return (x, y): pair features labeled 1 when the second token is the true next token."""
        features_rows = []
        labels = []
        for current_token, remaining_tokens, label_page, _ in self.loop_token_combinations():
            for remaining_token in remaining_tokens:
                features_rows.append(self.get_features(current_token, remaining_token))
                labels.append(int(label_page.is_next_token(current_token, remaining_token)))
        return self.features_rows_to_x(features_rows), labels

    def predict(self, model_path: str | Path = None, candidate_count: int = 18):
        """Count how often the true next token is missing from the top candidate_count candidates."""
        mistake_count = 0
        model = lgb.Booster(model_file=model_path)
        for current_token, remaining_tokens, label_page, reading_order_no in self.loop_token_combinations():
            candidate_tokens = self.get_candidate_tokens(candidate_count, current_token, model, remaining_tokens)
            if self.get_next_token(reading_order_no, label_page, remaining_tokens) not in candidate_tokens:
                mistake_count += 1
        return mistake_count

    def get_candidate_tokens(self, candidate_count, current_token, model, remaining_tokens):
        """Return the candidate_count remaining tokens the model scores highest as the next token."""
        features = [self.get_features(current_token, remaining_token) for remaining_token in remaining_tokens]
        prediction_scores = model.predict(self.features_rows_to_x(features))
        candidate_token_indexes = np.argsort([prediction_scores[:, 1]], axis=1)[:, -candidate_count:]
        candidate_tokens = [remaining_tokens[i] for i in candidate_token_indexes[0]]
        return candidate_tokens


# --- src/pdf_reading_order/ReadingOrderLabelPage.py ---
from pdf_features.PdfToken import PdfToken


class ReadingOrderLabelPage:
    """Maps token ids to their labeled reading order position on one page."""

    def __init__(self):
        # The padding token is position 0 so position 1 is the first real token.
        self.reading_order_by_token_id: dict[str, int] = {"pad_token": 0}

    def is_next_token(self, current_token: PdfToken, candidate_token: PdfToken):
        """True when candidate_token is labeled exactly one position after current_token."""
        if current_token.id not in self.reading_order_by_token_id:
            return False

        if candidate_token.id not in self.reading_order_by_token_id:
            return False

        return self.reading_order_by_token_id[candidate_token.id] == self.reading_order_by_token_id[current_token.id] + 1

    def is_coming_earlier(self, token_1: PdfToken, token_2: PdfToken):
        """True when token_2 is labeled before token_1 (raises KeyError for unlabeled tokens)."""
        return self.reading_order_by_token_id[token_2.id] < self.reading_order_by_token_id[token_1.id]
# --- src/pdf_reading_order/ReadingOrderTrainer.py ---
import os

import lightgbm as lgb
import numpy as np
from random import shuffle
from pathlib import Path
from os.path import join
from pdf_features.PdfPage import PdfPage
from pdf_features.PdfToken import PdfToken
from pdf_token_type_labels.TokenType import TokenType
from pdf_tokens_type_trainer.TokenFeatures import TokenFeatures
from pdf_reading_order.config import ROOT_PATH
from pdf_reading_order.ReadingOrderBase import ReadingOrderBase
from pdf_reading_order.ReadingOrderLabelPage import ReadingOrderLabelPage
from pdf_reading_order.ReadingOrderCandidatesTrainer import ReadingOrderCandidatesTrainer as CandidatesTrainer
from pdf_reading_order.download_models import reading_order_model
from pdf_reading_order.download_models import candidate_selector_model

# Module-level switch: benchmark scripts flip this to False for the segmented variant.
USE_CANDIDATES_MODEL = True
CANDIDATE_COUNT = 18
# Prefer a locally trained benchmark selector; fall back to the downloaded model.
candidate_selector_model_path = join(ROOT_PATH, "model", "candidate_selector_benchmark.model")
if not os.path.exists(candidate_selector_model_path):
    candidate_selector_model_path = candidate_selector_model
CANDIDATE_TOKEN_MODEL = lgb.Booster(model_file=candidate_selector_model_path)


class ReadingOrderTrainer(ReadingOrderBase):
    """Trains and applies the pairwise 'which candidate comes first' reading order model."""

    def get_candidate_tokens(self, current_token, remaining_tokens: list[PdfToken]):
        """Return the CANDIDATE_COUNT remaining tokens the selector model scores highest."""
        features = [CandidatesTrainer.get_features(current_token, remaining) for remaining in remaining_tokens]
        prediction_scores = CANDIDATE_TOKEN_MODEL.predict(self.features_rows_to_x(features))
        candidate_token_indexes = np.argsort([prediction_scores[:, 1]], axis=1)[:, -CANDIDATE_COUNT:]
        candidate_tokens = [remaining_tokens[i] for i in candidate_token_indexes[0]]
        return candidate_tokens

    @staticmethod
    def get_token_type_features(token: PdfToken) -> list[int]:
        """One-hot encode the token's type over all TokenType values."""
        return [1 if token_type == token.token_type else 0 for token_type in TokenType]

    def get_features(self, current_token, first_candidate, second_candidate, token_features, page_tokens):
        """Feature row for (current, candidate A, candidate B): geometry, types, and pairwise features."""
        bounding_box = current_token.bounding_box
        features = [bounding_box.top, bounding_box.left, bounding_box.width, bounding_box.height]
        features += self.get_token_type_features(current_token)
        features += token_features.get_features(first_candidate, second_candidate, page_tokens)
        features += self.get_token_type_features(first_candidate)
        features += token_features.get_features(second_candidate, first_candidate, page_tokens)
        features += self.get_token_type_features(second_candidate)
        return features

    @staticmethod
    def get_next_token_label(reading_order_no, label_page: ReadingOrderLabelPage, remaining_tokens: list[PdfToken]):
        """Return the remaining token labeled with reading_order_no, else None."""
        for remaining_token in remaining_tokens:
            if label_page.reading_order_by_token_id[remaining_token.id] == reading_order_no:
                return remaining_token

    def loop_candidates_for_each_token(self):
        """Walk each page in labeled order, yielding the candidate pool for every step.

        NOTE(review): when USE_CANDIDATES_MODEL is False the yielded candidates list IS
        remaining_tokens (no copy), so consumers that shuffle it mutate this loop's list.
        Correctness here relies only on membership and .remove(), so order changes are
        harmless — but keep that aliasing in mind when modifying either side.
        """
        for pdf_reading_order_tokens, token_features, page in self.loop_token_features():
            label_page = pdf_reading_order_tokens.labeled_page_by_raw_page[page]
            current_token = self.get_padding_token(-1, page.page_number)
            reading_order_no = 1
            remaining_tokens = page.tokens.copy()
            for i in range(len(page.tokens)):
                candidates = (
                    self.get_candidate_tokens(current_token, remaining_tokens) if USE_CANDIDATES_MODEL else remaining_tokens
                )
                yield current_token, candidates, token_features, label_page, page
                current_token = self.get_next_token_label(reading_order_no, label_page, remaining_tokens)
                reading_order_no += 1
                remaining_tokens.remove(current_token)

    def get_training_data(self):
        """Return (x, y) where y marks whether the challenger candidate comes earlier than the incumbent."""
        features_rows = []
        labels = []
        for current_token, candidate_tokens, token_features, label_page, page in self.loop_candidates_for_each_token():
            shuffle(candidate_tokens)
            next_token = candidate_tokens[0]
            for candidate_token in candidate_tokens[1:]:
                feature_row = self.get_features(current_token, next_token, candidate_token, token_features, page.tokens)
                features_rows.append(feature_row)
                # Evaluate once; the original called is_coming_earlier twice per candidate.
                candidate_comes_earlier = label_page.is_coming_earlier(next_token, candidate_token)
                labels.append(candidate_comes_earlier)
                if candidate_comes_earlier:
                    next_token = candidate_token

        return self.features_rows_to_x(features_rows), labels

    def find_next_token(self, lightgbm_model, token_features, page_tokens, candidate_tokens, current_token):
        """Tournament over candidates: keep whichever of each pair the model says comes first."""
        next_token = candidate_tokens[0]
        for candidate_token in candidate_tokens[1:]:
            features_rows = [self.get_features(current_token, next_token, candidate_token, token_features, page_tokens)]
            X = self.features_rows_to_x(features_rows)
            if int(np.argmax(lightgbm_model.predict(X))) == 1:
                next_token = candidate_token
        return next_token

    def get_reading_orders_for_page(self, lightgbm_model: lgb.Booster, token_features: TokenFeatures, page: PdfPage):
        """Predict a full reading order for one page; returns {token: 1-based position}."""
        current_token = self.get_padding_token(-1, page.page_number)
        remaining_tokens = page.tokens.copy()
        reading_order_by_token = {}
        current_reading_order_no = 1
        for i in range(len(page.tokens)):
            candidates = (
                self.get_candidate_tokens(current_token, remaining_tokens) if USE_CANDIDATES_MODEL else remaining_tokens
            )
            current_token = self.find_next_token(lightgbm_model, token_features, page.tokens, candidates, current_token)
            remaining_tokens.remove(current_token)
            reading_order_by_token[current_token] = current_reading_order_no
            current_reading_order_no += 1

        return reading_order_by_token

    @staticmethod
    def reorder_page_tokens(page: PdfPage, reading_order_by_token: dict[PdfToken, int]):
        """Store each token's predicted position and sort the page's tokens by it."""
        for token in page.tokens:
            token.prediction = reading_order_by_token[token]
        page.tokens.sort(key=lambda _token: _token.prediction)

    def predict(self, model_path: str | Path = None):
        """Predict reading orders for every page, reordering tokens in place."""
        model_path = model_path if model_path else reading_order_model
        lightgbm_model = lgb.Booster(model_file=model_path)
        for pdf_reading_order_tokens, token_features, page in self.loop_token_features():
            reading_order_by_token = self.get_reading_orders_for_page(lightgbm_model, token_features, page)
            self.reorder_page_tokens(page, reading_order_by_token)


# --- src/pdf_reading_order/config.py ---
from os.path import join
from pathlib import Path

ROOT_PATH = Path(__file__).parent.parent.parent.absolute()

PDF_LABELED_DATA_ROOT_PATH = Path(join(ROOT_PATH.parent, "pdf-labeled-data"))
READING_ORDER_RELATIVE_PATH = join("labeled_data", "reading_order")


# --- src/pdf_reading_order/download_models.py ---
from huggingface_hub import hf_hub_download

# Pinned revisions keep benchmark results reproducible.
candidate_selector_model = hf_hub_download(
    repo_id="HURIDOCS/pdf-reading-order",
    filename="candidate_selector_model.model",
    revision="4117935c3500d58eca15ca89dfba211e5c73ae45",
)

reading_order_model = hf_hub_download(
    repo_id="HURIDOCS/pdf-reading-order",
    filename="reading_order_model.model",
    revision="17cf6f396cfd39d2290d70264f97b640c9f5b5c7",
)


# --- src/pdf_reading_order/load_labeled_data.py (imports) ---
from os import listdir
from os.path import join, isdir
from pdf_reading_order.config import READING_ORDER_RELATIVE_PATH
from pdf_reading_order.PdfReadingOrderTokens import PdfReadingOrderTokens
# --- src/pdf_reading_order/load_labeled_data.py (continued) ---


def loop_datasets(reading_order_labeled_data_path: str, filter_in: str):
    """Yield (dataset_name, dataset_path) for every dataset directory matching filter_in."""
    print(reading_order_labeled_data_path)
    for dataset_name in listdir(reading_order_labeled_data_path):
        if filter_in and filter_in not in dataset_name:
            continue

        dataset_path = join(reading_order_labeled_data_path, dataset_name)

        if not isdir(dataset_path):
            continue

        yield dataset_name, dataset_path


def load_labeled_data(pdf_labeled_data_root_path: str, filter_in: str = None) -> list[PdfReadingOrderTokens]:
    """Load every labeled PDF under the reading order datasets, optionally filtered by name."""
    if filter_in:
        print(f"Loading only datasets with the key word: {filter_in}")
        print()

    # Renamed from the copy-pasted 'pdf_paragraph_tokens_list' to match what it holds.
    pdf_reading_order_tokens_list: list[PdfReadingOrderTokens] = list()
    reading_order_labeled_data_path: str = join(pdf_labeled_data_root_path, READING_ORDER_RELATIVE_PATH)

    for dataset_name, dataset_path in loop_datasets(reading_order_labeled_data_path, filter_in):
        print(f"loading {dataset_name} from {dataset_path}")

        dataset_pdf_name = [(dataset_name, pdf_name) for pdf_name in listdir(dataset_path)]
        for dataset, pdf_name in dataset_pdf_name:
            pdf_reading_order_tokens = PdfReadingOrderTokens.from_labeled_data(pdf_labeled_data_root_path, dataset, pdf_name)
            pdf_reading_order_tokens_list.append(pdf_reading_order_tokens)

    return pdf_reading_order_tokens_list


# --- src/pdf_reading_order/model_configuration.py ---
from pdf_tokens_type_trainer.ModelConfiguration import ModelConfiguration

candidate_config_json = {
    "boosting_type": "gbdt",
    "verbose": -1,
    "learning_rate": 0.1,
    "num_class": 2,
    "context_size": 18,
    "num_boost_round": 500,
    "num_leaves": 455,
    "bagging_fraction": 0.9582447218059061,
    "bagging_freq": 1,
    "feature_fraction": 0.7479496700086276,
    "lambda_l1": 0.00017789899076539243,
    "lambda_l2": 0.050461704863219915,
    "min_data_in_leaf": 98,
    "feature_pre_filter": True,
    "seed": 22,
    "deterministic": True,
}


reading_order_config_json = {
    "boosting_type": "gbdt",
    "verbose": -1,
    "learning_rate": 0.1,
    "num_class": 2,
    "context_size": 18,
    "num_boost_round": 800,
    "num_leaves": 255,
    "bagging_fraction": 0.8140916039821084,
    "bagging_freq": 9,
    "feature_fraction": 0.3526400612810575,
    "lambda_l1": 5.058643948078386e-08,
    "lambda_l2": 0.017293649765588552,
    "min_data_in_leaf": 34,
    "feature_pre_filter": False,
    "seed": 22,
    "deterministic": True,
}


segmented_reading_order_config_json = {
    "boosting_type": "gbdt",
    "verbose": -1,
    "learning_rate": 0.1,
    "num_class": 2,
    "context_size": 18,
    "num_boost_round": 400,
    "num_leaves": 490,
    "bagging_fraction": 0.9737322102732517,
    "bagging_freq": 8,
    "feature_fraction": 0.2818771210976459,
    "lambda_l1": 4.188768069014224e-08,
    "lambda_l2": 0.0028784995521791176,
    "min_data_in_leaf": 34,
    "feature_pre_filter": True,
    "seed": 22,
    "deterministic": True,
}

CANDIDATE_MODEL_CONFIGURATION = ModelConfiguration(**candidate_config_json)
READING_ORDER_MODEL_CONFIGURATION = ModelConfiguration(**reading_order_config_json)
SEGMENTED_READING_ORDER_MODEL_CONFIGURATION = ModelConfiguration(**segmented_reading_order_config_json)


if __name__ == "__main__":
    print(CANDIDATE_MODEL_CONFIGURATION)


# --- src/predict.py ---
import typer
from pdf_token_type_labels.TokenType import TokenType
from pdf_tokens_type_trainer.TokenTypeTrainer import TokenTypeTrainer
from OrderedPage import OrderedPage
from pdf_features.PdfFeatures import PdfFeatures
from SegmentProcessor import SegmentProcessor
from pdf_reading_order.PdfReadingOrderTokens import PdfReadingOrderTokens
from pdf_tokens_type_trainer.ModelConfiguration import ModelConfiguration
from pdf_reading_order.ReadingOrderTrainer import ReadingOrderTrainer


def predict(pdf_path: str, extract_figures_and_tables: bool = False, model_path: str = None):
    """Predict the reading order of a PDF and print one ordered-page dict per page."""
    pdf_features = PdfFeatures.from_pdf_path(pdf_path)
    pdf_reading_order_tokens = PdfReadingOrderTokens(pdf_features, {})
    token_type_trainer = TokenTypeTrainer([pdf_features])
    token_type_trainer.set_token_types()
    if extract_figures_and_tables:
        table_figure_processor = SegmentProcessor([pdf_reading_order_tokens], [TokenType.FIGURE, TokenType.TABLE])
        table_figure_processor.process()
    trainer = ReadingOrderTrainer([pdf_reading_order_tokens], ModelConfiguration())
    trainer.predict(model_path)

    predictions: list[OrderedPage] = [
        OrderedPage.from_pdf_tokens(pdf_features.file_name, page.page_number, page.tokens)
        for page in pdf_reading_order_tokens.pdf_features.pages
    ]

    print([prediction.to_dict() for prediction in predictions])


if __name__ == "__main__":
    typer.run(predict)


# --- src/show_reading_orders.py ---
import shutil
from os.path import join
import pdf_reading_order.ReadingOrderTrainer
from pdf_token_type_labels.TaskMistakes import TaskMistakes
from benchmark_segmented_reading_order import get_segmented_pdf_reading_order_tokens
from pdf_reading_order.PdfReadingOrderTokens import PdfReadingOrderTokens
from pdf_reading_order.ReadingOrderTrainer import ReadingOrderTrainer
from pdf_reading_order.model_configuration import SEGMENTED_READING_ORDER_MODEL_CONFIGURATION

PDF_LABELED_DATA_ROOT_PATH = "/path/to/pdf-labeled-data"
MISTAKES_NAME = "segmented_reading_order"


def get_reading_order_predictions(model_path: str, pdf_reading_order_tokens_list: list[PdfReadingOrderTokens]):
    """Predict reading orders in place with the segmented model (candidate selector disabled)."""
    pdf_reading_order.ReadingOrderTrainer.USE_CANDIDATES_MODEL = False
    trainer = ReadingOrderTrainer(pdf_reading_order_tokens_list, SEGMENTED_READING_ORDER_MODEL_CONFIGURATION)
    print("Model prediction started...")
    trainer.predict(model_path)


def show_mistakes(pdf_reading_order_tokens_list: list[PdfReadingOrderTokens]):
    """Write per-document task-mistakes visualizations comparing predictions with labels."""
    # Derive the cleanup path from the constants; the original hard-coded the same path
    # separately, so changing PDF_LABELED_DATA_ROOT_PATH or MISTAKES_NAME desynchronized it.
    shutil.rmtree(join(PDF_LABELED_DATA_ROOT_PATH, "labeled_data", "task_mistakes", MISTAKES_NAME), ignore_errors=True)

    for pdf_reading_order_tokens in pdf_reading_order_tokens_list:
        task_mistakes = TaskMistakes(
            PDF_LABELED_DATA_ROOT_PATH, MISTAKES_NAME, pdf_reading_order_tokens.pdf_features.file_name
        )
        for page in pdf_reading_order_tokens.pdf_features.pages:
            labeled_page = pdf_reading_order_tokens.labeled_page_by_raw_page[page]
            for segment_index, segment in enumerate(page.tokens):
                if segment.prediction != labeled_page.reading_order_by_token_id[segment.id]:
                    task_mistakes.add(page.page_number, segment.bounding_box, 1, 0, str(segment_index))
                    continue
                task_mistakes.add(page.page_number, segment.bounding_box, 1, 1, str(segment_index))

        task_mistakes.save()


def run():
    """Predict on the test split and dump the mistake visualizations."""
    model_path = "/path/to/pdf-reading-order/model/segmented_reading_order_benchmark.model"
    pdf_reading_order_tokens_list = get_segmented_pdf_reading_order_tokens("test")
    get_reading_order_predictions(model_path, pdf_reading_order_tokens_list)
    show_mistakes(pdf_reading_order_tokens_list)


if __name__ == "__main__":
    run()