├── .gitignore ├── examples └── __init__.py ├── pdfstructure ├── __init__.py ├── analysis │ ├── __init__.py │ ├── annotate.py │ ├── sizemapper.py │ └── styledistribution.py ├── hierarchy │ ├── __init__.py │ ├── detectheader.py │ ├── headercompare.py │ ├── parser.py │ └── traversal.py ├── model │ ├── __init__.py │ ├── document.py │ └── style.py ├── printer.py ├── source.py └── utils.py ├── readme.md ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── helper.py ├── resources ├── 5648.pdf ├── IE00BM67HT60-ATB-FS-DE-2020-2-28.pdf ├── SameSize_BoldTitle.pdf ├── SameSize_EnumeratedTitle.pdf ├── SameStyleOnly.pdf ├── interview_cheatsheet-excerpt.png ├── interview_cheatsheet.pdf ├── lorem.pdf ├── paper.pdf ├── parsed │ ├── interview_cheatsheet.json │ └── interview_cheatsheet_pretty.txt └── samplepptx.pdf ├── test_custom_use_cases.py ├── test_document.py ├── test_headercompare.py ├── test_hierarchy.py ├── test_printer.py ├── test_style_analyser.py ├── test_traversal.py └── test_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | .idea 3 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/examples/__init__.py -------------------------------------------------------------------------------- /pdfstructure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/pdfstructure/__init__.py -------------------------------------------------------------------------------- /pdfstructure/analysis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/pdfstructure/analysis/__init__.py -------------------------------------------------------------------------------- /pdfstructure/analysis/annotate.py: -------------------------------------------------------------------------------- 1 | import statistics 2 | from collections import Counter 3 | 4 | from pdfminer.layout import LTTextBoxHorizontal, LTChar 5 | 6 | from pdfstructure.analysis.sizemapper import SizeMapper 7 | from pdfstructure.analysis.styledistribution import StyleDistribution 8 | from pdfstructure.model.document import TextElement 9 | from pdfstructure.model.style import Style, TextSize 10 | from pdfstructure.utils import truncate 11 | 12 | 13 | class StyleAnnotator: 14 | """ 15 | creates a PdfElements from incoming pdf-paragraphs (raw LTTextContainer from pdfminer.six). 16 | - annotates paragraph with @Style(italic, bold, fontname, mapped_size, mean_size). 17 | - mapped_font_size: captures most dominant character size within paragraph & maps it to TextSize Enum. 18 | mapped Size is leveraged by the hierarchy detection algorithm. 19 | """ 20 | 21 | def __init__(self, sizemapper: SizeMapper, style_info: StyleDistribution): 22 | self._sizeMapper = sizemapper 23 | self._styleInfo = style_info 24 | 25 | @staticmethod 26 | def __investigate_box_style(element): 27 | fonts = Counter() 28 | sizes = [] 29 | for line in element: 30 | for c in line: 31 | if isinstance(c, LTChar): 32 | fonts.update([c.fontname]) 33 | sizes.append(c.size) 34 | return fonts, sizes 35 | 36 | def process(self, element_gen): # element: LTTextContainer): 37 | """" 38 | annotate each element with fontsize 39 | """ 40 | for element in element_gen: 41 | if isinstance(element, LTTextBoxHorizontal): 42 | 43 | fonts, sizes = self.__investigate_box_style(element) 44 | if not fonts or not element.get_text().rstrip(): 45 | continue 46 | 47 | font_name = fonts.most_common(1)[0][0] 48 | mean_size = truncate(statistics.mean(sizes), 1) 49 | max_size = max(sizes) 50 | # todo currently empty boxes are forwarded.. with holding only \n 51 | mapped_size = self._sizeMapper.translate(target_enum=TextSize, 52 | value=max_size) 53 | s = Style(bold="bold" in str(font_name.lower()), 54 | italic="italic" in font_name.lower(), 55 | font_name=font_name, 56 | mapped_font_size=mapped_size, 57 | mean_size=mean_size, max_size=max_size) 58 | 59 | # todo, split lines within LTTextBoxHorizontal 60 | # split using style as differentiator 61 | # e.g 1st is title with bold text 62 | # 2nd & 3rd line are introduction lines with body style 63 | # -> forward 2 boxes (header, content) 64 | yield TextElement(text_container=element, style=s, 65 | page=element.page if hasattr(element, "page") else None) 66 | -------------------------------------------------------------------------------- /pdfstructure/analysis/sizemapper.py: -------------------------------------------------------------------------------- 1 | import math 2 | from enum import Enum 3 | from typing import Type 4 | 5 | from pdfstructure.analysis.styledistribution import StyleDistribution 6 | from pdfstructure.model.style import TextSize 7 | 8 | 9 | class SizeMapper: 10 | 11 | def __init__(self): 12 | self._borders = None 13 | 14 | @property 15 | def borders(self): 16 | return self._borders 17 | 18 | def translate(self, target_enum: Type[TextSize], value) -> Enum: 19 | return TextSize.from_range(self.borders, value) 20 | 21 | 22 | class PivotLogMapper(SizeMapper): 23 | def __init__(self, style_info: StyleDistribution, bins=5): 24 | super().__init__() 25 | self.bins = bins 26 | borders = [] 27 | # find pivot 28 | # diff pivot to max & min 29 | pivot = style_info.body_size 30 | # if style_info.min_found_size <= pivot <= style_info.max_found_size: 31 | right_span = style_info.max_found_size - pivot 32 | left_span = pivot - style_info.min_found_size 33 | 34 | if right_span > pivot * 2: 35 | right_span = pivot * 2 36 | if right_span == 0: 37 | right_span = 5 38 | if left_span == 0: 39 | left_span = 5 40 | 41 | targetSteps = bins / 2. 42 | alpha = 0.5 43 | thRunner = pivot 44 | mem = 0 45 | for i in range(1, int((bins / 2) + 1)): 46 | scaledStep = left_span / targetSteps * self.weight(i) + mem * alpha 47 | thRunner -= scaledStep 48 | mem = scaledStep 49 | borders.insert(0, thRunner) 50 | thRunner = pivot 51 | mem = 0 52 | for i in range(1, int((bins / 2) + 1)): 53 | scaledStep = right_span / targetSteps * self.weight(i) + mem * alpha 54 | thRunner += scaledStep 55 | mem = scaledStep 56 | borders.append(thRunner) 57 | 58 | self._borders = tuple(borders) 59 | 60 | @staticmethod 61 | def weight(n): 62 | """ 63 | used in sizemapper, walking in N steps from pivot point towards max & min found size. 64 | first step is weighted more (to have a more narrow pivot), the further towards edge values, the further the step. 65 | @param n: 66 | @return: 67 | """ 68 | return 1.0 - 1. / math.exp(n - 0.2) 69 | 70 | 71 | class PivotLinearMapper(SizeMapper): 72 | 73 | def __init__(self, style_info: StyleDistribution): 74 | # find pivot 75 | # diff pivot to max & min 76 | super().__init__() 77 | pivot = style_info.body_size 78 | # if style_info.min_found_size <= pivot <= style_info.max_found_size: 79 | right_span = style_info.max_found_size - pivot 80 | left_span = pivot - style_info.min_found_size 81 | # else: 82 | # print("error?") 83 | 84 | right_step = (right_span / 2.) * 0.5 85 | left_step = (left_span / 2.) * 0.5 86 | 87 | b0, b1 = style_info.min_found_size + left_step, style_info.min_found_size + left_step * 2 88 | b2, b3 = pivot + right_step, pivot + right_step * 2 89 | self._borders = (b0, b1, b2, b3) 90 | 91 | 92 | class LinearSizeMapper(SizeMapper): 93 | 94 | def __init__(self, style_info: StyleDistribution): 95 | super().__init__() 96 | self.style_info = style_info 97 | 98 | def translate(self, target_enum, value) -> Enum: 99 | # Figure out how 'wide' each range is 100 | leftSpan = self.style_info.max_found_size - self.style_info.min_found_size 101 | rightSpan = target_enum.xlarge.value - target_enum.xsmall.value 102 | 103 | # Convert the left range into a 0-1 range (float) 104 | scaled = float(value - self.style_info.min_found_size) / float(leftSpan) 105 | if scaled > 1.0: 106 | return target_enum.xlarge 107 | elif scaled < 0: 108 | return target_enum.xsmall 109 | 110 | else: 111 | # Convert the 0-1 range into a value in the right range. 112 | return TextSize(int(target_enum.xsmall.value + (scaled * rightSpan))) 113 | -------------------------------------------------------------------------------- /pdfstructure/analysis/styledistribution.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from collections import Counter, defaultdict 3 | 4 | from pdfminer.layout import LTTextContainer, LTTextLine, LTChar 5 | from sortedcontainers import SortedDict 6 | 7 | from pdfstructure.utils import truncate, closest_key 8 | 9 | 10 | class StyleDistribution: 11 | """ 12 | Represents style information for one analysed element stream (typically one stream per document). 13 | """ 14 | 15 | def __init__(self, data=None, line_margin=0.5): 16 | """ 17 | 18 | :type data: Counter 19 | :param data: 20 | """ 21 | if data: 22 | self._data = data 23 | self._body_size = data.most_common(1)[0][0] 24 | self._min_found_size, self._max_found_size = min(data.keys()), max(data.keys()) 25 | if self._min_found_size == self._max_found_size: 26 | self._min_found_size /= 2 27 | self._max_found_size *= 2 28 | self._line_margin = line_margin 29 | 30 | @property 31 | def line_margin(self): 32 | return self._line_margin 33 | 34 | def norm_data_binned(self, bins=50): 35 | amount_items = self.amount_values 36 | step = 1.0 / bins 37 | keys = [step * i for i in range(bins)] 38 | normalised = SortedDict({key: 0.0 for key in keys}) 39 | for size in self.data: 40 | norm_key = truncate(size / self.max_found_size, 2) 41 | k = closest_key(normalised, norm_key) 42 | normalised[k] += float(self.data[size]) / amount_items 43 | 44 | return normalised 45 | 46 | @property 47 | def norm_data(self): 48 | # normalise counts with total amount of collected values 49 | # normalise each key value against max found key value (size) 50 | # normalise X & Y 51 | normalised = defaultdict(int) 52 | amount_items = self.amount_values 53 | for size in self.data: 54 | normalised[truncate(size / self.max_found_size, 2)] += float(self.data[size]) / amount_items 55 | 56 | return normalised 57 | 58 | @property 59 | def min_found_size(self): 60 | return self._min_found_size 61 | 62 | @property 63 | def max_found_size(self): 64 | return self._max_found_size 65 | 66 | @staticmethod 67 | def get_min_size(data: Counter, body_size, title_size): 68 | if len(data) > 2: 69 | tmin = sorted(data.keys(), reverse=True)[:3][-1] 70 | return tmin if tmin > body_size else title_size - 0.5 71 | else: 72 | return title_size - 0.5 73 | 74 | @property 75 | def body_size(self): 76 | return self._body_size 77 | 78 | @property 79 | def is_empty(self): 80 | return not self._data 81 | 82 | @property 83 | def amount_values(self): 84 | return sum(self._data.values(), 0.0) 85 | 86 | @property 87 | def amount_sizes(self): 88 | """ 89 | amount of found sizes 90 | :return: 91 | """ 92 | return len(self._data) 93 | 94 | @property 95 | def data(self) -> Counter: 96 | return self._data.copy() 97 | 98 | 99 | class SizeAnalyser: 100 | def __init__(self): 101 | self.sizeDistribution = Counter() 102 | 103 | def consume(self, node: LTTextContainer): 104 | sizes = list(itertools.islice( 105 | [c.size for c in node if isinstance(c, LTChar)], 10)) 106 | # get max size, check that it occurred at least twice 107 | maxSize = max(sizes) 108 | if sizes.count(maxSize) > 2: 109 | self.sizeDistribution.update([truncate(maxSize, 2)]) 110 | 111 | def process_result(self): 112 | pass 113 | 114 | 115 | class LineMarginAnalyer: 116 | _previousNode: LTTextContainer 117 | 118 | def __init__(self): 119 | self._distanceCounter = defaultdict(int) 120 | self._headingTrailingCounter = defaultdict(int) 121 | self._previousNode = None 122 | self._y = None 123 | self._previousBoxHeight = None 124 | 125 | def consume(self, node: LTTextContainer): 126 | if self._previousNode: 127 | diff = truncate(abs(self._previousNode.y0 - node.y1), 2) 128 | if self._previousNode.height == node.height: 129 | self._distanceCounter[(diff, node.height)] += 1 130 | else: 131 | self._headingTrailingCounter[(diff, self._previousNode.height, node.height)] += 1 132 | 133 | self._previousNode = node 134 | 135 | def process_result(self): 136 | """ 137 | Find relative line margin threshold that will be used in pdfminers paragraphs algorithm. 138 | lines that are vertically closer than margin * height are considered to belong to the same paragraph. 139 | @return: 140 | """ 141 | (abs_margin, line_height), count = max(self._distanceCounter.items(), key=lambda item: item[1]) 142 | body_line_margin = min(0.5, 1.75 * abs_margin / line_height) 143 | # todo, find next largest value from title_trailing --> margin should be smaller than that 144 | # sorted(self._headingTrailingCounter.keys(), key=lambda keys: keys[1]) 145 | # print("line margin: {}".format(line_margin)) 146 | return body_line_margin 147 | 148 | 149 | def count_sizes(element_gen) -> StyleDistribution: 150 | """ 151 | analyse used fonts, character sizes, paragraph margins etc. 152 | :param element_gen: 153 | :return: 154 | """ 155 | sizeAnalyser = SizeAnalyser() 156 | lineMarginAnalyser = LineMarginAnalyer() 157 | 158 | for element in element_gen: 159 | if isinstance(element, LTTextContainer): 160 | for node in element: 161 | if not isinstance(node, LTTextLine) or node.is_empty() \ 162 | or len(node._objs) == 0: 163 | continue 164 | 165 | sizeAnalyser.consume(node) 166 | lineMarginAnalyser.consume(node) 167 | 168 | if not sizeAnalyser.sizeDistribution: 169 | raise TypeError("document does not contain text") 170 | 171 | return StyleDistribution(sizeAnalyser.sizeDistribution, line_margin=lineMarginAnalyser.process_result()) 172 | -------------------------------------------------------------------------------- /pdfstructure/hierarchy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/pdfstructure/hierarchy/__init__.py -------------------------------------------------------------------------------- /pdfstructure/hierarchy/detectheader.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | from pdfminer.layout import LTTextBoxVertical 4 | 5 | from pdfstructure.analysis.styledistribution import StyleDistribution 6 | from pdfstructure.model.document import TextElement 7 | from pdfstructure.model.style import TextSize 8 | from pdfstructure.utils import word_generator 9 | 10 | 11 | def header_detector(element: TextElement, style_distribution: StyleDistribution): 12 | if isinstance(element._data, LTTextBoxVertical): 13 | return False 14 | stats = Counter() 15 | terms = element._data 16 | style = element.style 17 | 18 | if len(element.text) <= 2: 19 | return False 20 | 21 | # data tuple per line, element from pdfminer, annotated style info for whole line 22 | # todo, compute ratios over whole line // or paragraph :O 23 | if (style.bold or style.italic) and style.mapped_font_size >= TextSize.middle \ 24 | or style.mapped_font_size > TextSize.middle \ 25 | or style.max_size > style_distribution.body_size + 2: 26 | return check_valid_header_tokens(terms) 27 | else: 28 | return False 29 | 30 | 31 | def check_valid_header_tokens(element): 32 | """ 33 | fr a paragraph to be treated as a header, it has to contain at least 2 letters. 34 | @param element: 35 | @return: 36 | """ 37 | alpha_count = 0 38 | numeric_count = 0 39 | for word in word_generator(element): 40 | for c in word: 41 | if c.isalpha(): 42 | alpha_count += 1 43 | if c.isnumeric(): 44 | numeric_count += 1 45 | 46 | if alpha_count >= 2: 47 | return True 48 | return False 49 | -------------------------------------------------------------------------------- /pdfstructure/hierarchy/headercompare.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from pdfstructure.model.document import Section 4 | from pdfstructure.utils import word_generator 5 | 6 | numeration_pattern = re.compile("^(?=.*\d+)((?=.*\.)|(?=.*:)).*$") 7 | white_space_pattern = re.compile("\\s+") 8 | 9 | 10 | class SubHeaderPredicate: 11 | """ 12 | Compares two paragraphs that are classified as headers, but have the same mapped FontSize. 13 | - Its possible that those headers are actually not on the same level, based on some conditions like H1 is enumerated & bold, H2 not. 14 | """ 15 | 16 | def __init__(self): 17 | self._conditions = [] 18 | 19 | def add_condition(self, condition): 20 | self._conditions.append(condition) 21 | 22 | def test(self, h1, h2): 23 | return any(condition(h1, h2) for condition in self._conditions) 24 | 25 | 26 | def get_default_sub_header_conditions(): 27 | _isSubHeader = SubHeaderPredicate() 28 | _isSubHeader.add_condition(condition_boldness) 29 | _isSubHeader.add_condition(condition_h1_enum_h2_not) 30 | _isSubHeader.add_condition(condition_h2_extends_h1) 31 | _isSubHeader.add_condition(condition_h1_slightly_bigger_h2) 32 | return _isSubHeader 33 | 34 | 35 | def condition_boldness(h1: Section, h2: Section): 36 | """ 37 | h2 is subheader if:if h1 is bold 38 | - h1 is bold & h2 is not bold 39 | - but skip if h2 is enumerated and h1 is not 40 | @param h1: 41 | @param h2: 42 | @return: 43 | """ 44 | h1start = next(word_generator(h1.heading._data)) 45 | h2start = next(word_generator(h2.heading._data)) 46 | if numeration_pattern.match(h2start) and not numeration_pattern.match(h1start): 47 | return False 48 | 49 | return h1.heading.style.bold and not h2.heading.style.bold 50 | 51 | 52 | def condition_h2_extends_h1(h1: Section, h2: Section): 53 | """ 54 | e.g.: h1 -> 1.1 some header 55 | h2 -> 1.1.2 some sub header 56 | @param h1: 57 | @param h2: 58 | @return: 59 | """ 60 | h1start = next(word_generator(h1.heading._data)) 61 | h2start = next(word_generator(h2.heading._data)) 62 | return len(h2start) > len(h1start) and h1start in h2start 63 | 64 | 65 | def condition_h1_enum_h2_not(h1: Section, h2: Section): 66 | """ 67 | e.g. h1 -> 1.1 some header title 68 | h2 -> some other header title 69 | -> applies only if both headers are of same style type 70 | 71 | """ 72 | if h2.heading.style.bold and not h1.heading.style.bold: 73 | return False 74 | # if h2.heading.style.font_name != h1.heading.style.font_name: 75 | # return False 76 | 77 | h1start = next(word_generator(h1.heading._data)) 78 | h2start = next(word_generator(h2.heading._data)) 79 | return numeration_pattern.match(h1start) and not numeration_pattern.match(h2start) 80 | 81 | 82 | def condition_h1_slightly_bigger_h2(h1: Section, h2: Section): 83 | """s 84 | Style analysis maps found sizes to a predefined enum (xsmall, small, large, xlarge). 85 | but sometimes it makes sense to look deeper. 86 | @param h1: 87 | @param h2: 88 | @return: 89 | """ 90 | return h1.heading.style.mapped_font_size == h2.heading.style.mapped_font_size \ 91 | and h1.heading.style.max_size - h2.heading.style.max_size > 1.0 92 | -------------------------------------------------------------------------------- /pdfstructure/hierarchy/parser.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import List, Generator 3 | 4 | from pdfminer.layout import LTTextContainer, LAParams 5 | 6 | from pdfstructure.analysis.annotate import StyleAnnotator 7 | from pdfstructure.analysis.sizemapper import PivotLogMapper 8 | from pdfstructure.analysis.styledistribution import count_sizes, StyleDistribution 9 | from pdfstructure.hierarchy.detectheader import header_detector 10 | from pdfstructure.hierarchy.headercompare import get_default_sub_header_conditions 11 | from pdfstructure.model.document import TextElement, Section, StructuredPdfDocument, DanglingTextSection 12 | from pdfstructure.source import Source 13 | 14 | 15 | class HierarchyParser: 16 | 17 | def __init__(self, sub_header_conditions=get_default_sub_header_conditions()): 18 | self._isSubHeader = sub_header_conditions 19 | 20 | def parse_pdf(self, source: Source) -> StructuredPdfDocument: 21 | """ 22 | Analysises and parses a PDF document from a given @Source containing its natural hierarchy. 23 | @param source: 24 | @return: 25 | """ 26 | # 1. iterate once through PDF and analyse style distribution 27 | distribution = count_sizes(source.read()) 28 | size_mapper = PivotLogMapper(distribution) 29 | style_annotator = StyleAnnotator(sizemapper=size_mapper, style_info=distribution) 30 | 31 | # 2. iterate second time trough pdf 32 | # - annotate each paragraph with mapped Style 33 | elements_with_style = style_annotator.process(source.read( 34 | override_la_params=LAParams(line_margin=distribution.line_margin))) 35 | 36 | # - create nested document structure on the fly 37 | structured_elements = self.create_hierarchy(elements_with_style, distribution) 38 | 39 | # 3. create wrapped document and capture some metadata 40 | pdf_document = StructuredPdfDocument(elements=structured_elements, style_info=distribution) 41 | enrich_metadata(pdf_document, source) 42 | return pdf_document 43 | 44 | def create_hierarchy(self, element_gen: Generator[TextElement, LTTextContainer, None], 45 | style_distribution: StyleDistribution) -> List[Section]: 46 | """ 47 | Takes incoming flat list of paragraphs and creates nested natural order hierarchy. 48 | 49 | Example Structure: 50 | ================== 51 | Document.pdf 52 | << 53 | 1. H1 Chapter Header 54 | content 55 | 1.2 H2 Section Header 56 | content 57 | 1.3 H2 Section Header 58 | content 59 | 1.3.1 H3 Subsection Header 60 | content 61 | 2. H1 Chapter Header 62 | content 63 | >> 64 | 65 | @param element_gen: 66 | @return: 67 | """ 68 | structured = [] 69 | level_stack = [] 70 | 71 | for element in element_gen: 72 | # if line is header 73 | style = element.style 74 | if header_detector(element, style_distribution): 75 | child = Section(element) 76 | header_size = style.mapped_font_size 77 | 78 | # initial state - push and continue with next element 79 | if not level_stack: 80 | self.__push_to_stack(child, level_stack, structured) 81 | continue 82 | 83 | stack_peek_size = level_stack[-1].heading.style.mapped_font_size 84 | 85 | if stack_peek_size > header_size: 86 | # append element as children 87 | self.__push_to_stack(child, level_stack, structured) 88 | 89 | else: 90 | # go up in hierarchy and insert element (as children) on its level 91 | self.__pop_stack_until_match(level_stack, header_size, child) 92 | self.__push_to_stack(child, level_stack, structured) 93 | 94 | else: 95 | # no header found, add paragraph as a content element to previous node 96 | # - content is on same level as its corresponding header 97 | content_node = Section(element, level=len(level_stack)) 98 | if level_stack: 99 | level_stack[-1].append_children(content_node) 100 | else: 101 | # if last element in output structure has also no header, merge 102 | if structured and isinstance(structured[-1], DanglingTextSection): 103 | structured[-1].append_children(content_node) 104 | else: 105 | # # add dangling content as section 106 | dangling_content = DanglingTextSection() 107 | dangling_content.append_children(content_node) 108 | dangling_content.set_level(len(level_stack)) 109 | structured.append(dangling_content) 110 | 111 | return structured 112 | 113 | def __pop_stack_until_match(self, stack, headerSize, header): 114 | # if top level is smaller than current header to test, pop it 115 | # repeat until top level is bigger or same 116 | 117 | while self.__top_has_no_header(stack) or self.__should_pop_higher_level(stack, header): 118 | poped = stack.pop() 119 | # header on higher level in stack has sime FontSize 120 | # -> check additional sub-header conditions like regexes, enumeration etc. 121 | if poped.heading.style.mapped_font_size == headerSize: 122 | # check if header_to_check is sub-header of poped element within stack 123 | if self._isSubHeader.test(poped, header): 124 | stack.append(poped) 125 | return 126 | 127 | @staticmethod 128 | def __push_to_stack(child, stack, output): 129 | """ 130 | insert incoming paragraph (child) in level(hierarchy) stack. 131 | @param child: next incoming paragraph 132 | @param stack: hierarchy-detect helper stack 133 | @param output: exporting list of elements (contains complete structure in the end) 134 | @return: 135 | """ 136 | if stack: 137 | child.set_level(len(stack)) 138 | stack[-1].children.append(child) 139 | else: 140 | # append as highest order element 141 | output.append(child) 142 | stack.append(child) 143 | 144 | @staticmethod 145 | def __should_pop_higher_level(stack: [Section], header_to_test: Section): 146 | """ 147 | helper method for __pop_stack_until_match: check if last element in stack is smaller then new header-paragraph. 148 | @type header_to_test: object 149 | 150 | """ 151 | if not stack: 152 | return False 153 | return stack[-1].heading.style.mapped_font_size <= header_to_test.heading.style.mapped_font_size 154 | 155 | @staticmethod 156 | def __top_has_no_header(stack: [Section]): 157 | """ 158 | helper method for @__pop_stack_until_match 159 | @param stack: 160 | @return: 161 | """ 162 | if not stack: 163 | return False 164 | return len(stack[-1].heading._data) == 0 165 | 166 | 167 | def enrich_metadata(pdf: StructuredPdfDocument, source: Source): 168 | """ 169 | add some metadata to parsed PDF if possible 170 | # add filename 171 | # todo create document summary 172 | # todo extract document-title from best titles 173 | @return: 174 | """ 175 | # try to capture filename 176 | try: 177 | filename = Path(source.uri).name 178 | pdf.update_metadata("filename", filename) 179 | except Exception as e: 180 | pass 181 | -------------------------------------------------------------------------------- /pdfstructure/hierarchy/traversal.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import deque 3 | from typing import Generator 4 | 5 | from pdfstructure.model.document import StructuredPdfDocument, Section 6 | 7 | 8 | def get_document_depth(document: StructuredPdfDocument): 9 | """ 10 | retrieves document depth found within tree structure, + 1 because the levels are 0 notated. 11 | """ 12 | return max(set([section.level for section in traverse_in_order(document)])) + 1 13 | 14 | 15 | def traverse_inorder_sections_with_content(document: StructuredPdfDocument) -> Generator[ 16 | tuple, StructuredPdfDocument, None]: 17 | """ 18 | Traverse section by section in order and group top children as combined content 19 | @param document: 20 | @return: yields level, title, content 21 | """ 22 | for section in filter(lambda sec: len(sec.children) > 0, traverse_in_order(document)): 23 | children: Section 24 | content = [] 25 | for children in section.children: 26 | if children.children: 27 | continue 28 | content.append(children.heading_text) 29 | yield section.level, section.heading_text, "\n".join(content) 30 | 31 | 32 | def traverse_in_order(document: StructuredPdfDocument) \ 33 | -> Generator[Section, StructuredPdfDocument, None]: 34 | """ 35 | 5 10 36 | / \ \ 37 | 1 2 3 38 | / | \ | 39 | a b c x 40 | 41 | yield order: 42 | - [5,1,a,b,c,2,10,3,x] 43 | """ 44 | 45 | def __traverse__(section: Section): 46 | child: Section 47 | for child in section.children: 48 | yield child 49 | yield from __traverse__(child) 50 | 51 | for element in document.elements: 52 | yield element 53 | yield from __traverse__(element) 54 | 55 | 56 | def traverse_level_order(document: StructuredPdfDocument, max_depth=sys.maxsize) \ 57 | -> Generator[Section, StructuredPdfDocument, None]: 58 | """ 59 | 5 10 60 | / \ \ 61 | 1 2 3 62 | / | \ | 63 | a b c x 64 | 65 | yield order: 66 | - [5,10,1,2,3,a,b,c,x] 67 | 68 | @param document: structured pdf document, each element holds its own dopth information (Section.level) 69 | @param max_depth: yield elements until max_depth is reached 70 | """ 71 | 72 | element_queue = deque() 73 | 74 | element: Section 75 | for element in document.elements: 76 | element_queue.append(element) 77 | 78 | while element_queue: 79 | element = element_queue.popleft() 80 | 81 | if element.level < max_depth: 82 | yield element 83 | 84 | # append next layer of elements / children nodes 85 | for child in element.children: 86 | element_queue.append(child) 87 | -------------------------------------------------------------------------------- /pdfstructure/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/pdfstructure/model/__init__.py -------------------------------------------------------------------------------- /pdfstructure/model/document.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import List 3 | 4 | from pdfminer.layout import LTTextContainer 5 | 6 | from pdfstructure.analysis.styledistribution import StyleDistribution 7 | from pdfstructure.model.style import Style 8 | 9 | 10 | class TextElement: 11 | """ 12 | Represents one single TextContainer like a line of words. 13 | """ 14 | 15 | def __init__(self, text_container: LTTextContainer, style: Style, text=None, page=None): 16 | self._data = text_container 17 | self._text = text 18 | self.style = style 19 | self.page = page 20 | 21 | @property 22 | def text(self): 23 | if not self._data: 24 | return self._text 25 | else: 26 | return self._data.get_text().strip() 27 | 28 | @classmethod 29 | def from_json(cls, data: dict): 30 | """ 31 | 32 | @param data: 33 | @return: 34 | """ 35 | if data: 36 | return TextElement(text_container=None, style=Style.from_json(data["style"]), 37 | text=data["text"]) 38 | return None 39 | 40 | def __str__(self): 41 | return self.text 42 | 43 | 44 | class Section: 45 | """ 46 | Represents a section with title, contents and children 47 | """ 48 | heading: TextElement 49 | 50 | def __init__(self, element: TextElement, level=0): 51 | self.heading = element 52 | self.children = [] # Section 53 | self.level = None 54 | self.set_level(level) 55 | 56 | def set_level(self, level): 57 | self.level = level 58 | 59 | def append_children(self, section): 60 | self.children.append(section) 61 | 62 | @property 63 | def full_content(self): 64 | """ 65 | Returns merged full content of all nested children. 66 | @return: 67 | """ 68 | contents = [self.heading_text] if self.heading_text else [] 69 | 70 | def __traverse__(section: Section): 71 | child: Section 72 | for child in section.children: 73 | yield child 74 | yield from __traverse__(child) 75 | 76 | for child in __traverse__(self): 77 | if child.heading_text: 78 | contents.append(child.heading_text) 79 | return "\n".join(contents) 80 | 81 | @property 82 | def top_level_content(self): 83 | """ 84 | Paragraphs that belong directly to section, nested children are skipped. 85 | Example: 86 | This is a Header 87 | paragraph 1 88 | paragraph 2 89 | This is a subheader 90 | paragraph 3 91 | Returns: 92 | [paragraph 1, paragraph 2] 93 | 94 | @return: List[Section] 95 | """ 96 | child: Section 97 | content = [] 98 | for child in self.children: 99 | if child.children: 100 | continue 101 | content.append(child) 102 | return content 103 | 104 | @classmethod 105 | def from_json(cls, data: dict): 106 | children = list(map(Section.from_json, data.get("children"))) 107 | heading = TextElement.from_json(data.get("heading")) 108 | element = cls(heading, data["level"]) 109 | element.children = children 110 | return element 111 | 112 | @property 113 | def heading_text(self): 114 | if self.heading and self.heading.text: 115 | return self.heading.text 116 | else: 117 | return "" 118 | 119 | def __str__(self): 120 | return self.heading_text 121 | # return "{}\n{}".format(self.heading.text, 122 | # " ".join([str(child.heading.text) for child in self.children])) 123 | 124 | 125 | class DanglingTextSection(Section): 126 | def __init__(self): 127 | super().__init__(element=None) 128 | 129 | def __str__(self): 130 | return "{}".format(" ".join([str(e) for e in self.content])) 131 | 132 | 133 | class StructuredPdfDocument: 134 | """ 135 | PDF document containing its natural order hierarchy, as detected by the HierarchyParser. 136 | """ 137 | elements: List[Section] 138 | 139 | def __init__(self, elements: [Section], style_info=None): 140 | self.metadata = defaultdict(str) 141 | self.elements = elements 142 | self.metadata["style_distribution"] = style_info 143 | 144 | def update_metadata(self, key, value): 145 | self.metadata[key] = value 146 | 147 | @property 148 | def text(self): 149 | return "\n".join([item.full_content for item in self.elements]) 150 | 151 | @property 152 | def title(self): 153 | return self.metadata.get("title") 154 | 155 | @property 156 | def style_distribution(self) -> StyleDistribution: 157 | return self.metadata.get("style_distribution") 158 | 159 | @classmethod 160 | def from_json(cls, data: dict): 161 | elements = list(map(Section.from_json, data["elements"])) 162 | pdf = cls(elements) 163 | pdf.metadata.update(data.get("metadata")) 164 | return pdf 165 | -------------------------------------------------------------------------------- /pdfstructure/model/style.py: -------------------------------------------------------------------------------- 1 | from enum import IntEnum, auto 2 | 3 | 4 | class TextSize(IntEnum): 5 | """ 6 | Used to annotate extracted paragraphs with a predefined range of possible FontSize. 7 | Convenient to perform style-based grouping of similar pieces of text. 8 | """ 9 | xsmall = auto() 10 | small = auto() 11 | middle = auto() 12 | large = auto() 13 | xlarge = auto() 14 | 15 | @classmethod 16 | def from_range(cls, borders: tuple, value: int): 17 | if value < borders[0]: 18 | return cls.xsmall 19 | elif borders[0] <= value < borders[1]: 20 | return cls.small 21 | elif borders[1] <= value < borders[2]: 22 | return cls.middle 23 | elif borders[2] <= value < borders[3]: 24 | return cls.large 25 | elif value >= borders[3]: 26 | return cls.xlarge 27 | 28 | 29 | class Style: 30 | """ 31 | Extracted paragraphs get annotated with found font-style information. 32 | """ 33 | 34 | def __init__(self, bold, italic, font_name, mapped_font_size: TextSize, mean_size: float, max_size: float): 35 | self.bold = bold 36 | self.italic = italic 37 | self.font_name = font_name 38 | self.mapped_font_size = mapped_font_size 39 | self.mean_size = mean_size 40 | self.max_size = max_size 41 | 42 | @classmethod 43 | def from_json(cls, data: dict): 44 | mapped_size = TextSize[data["mapped_font_size"]] 45 | data["mapped_font_size"] = mapped_size 46 | return cls(**data) 47 | 48 | def __gt__(self, other): 49 | if isinstance(other, Style): 50 | return self.mapped_font_size > other.mapped_font_size or \ 51 | self.mapped_font_size == other.mapped_font_size and self.bold and not other.bold 52 | return False 53 | 54 | def __lt__(self, other): 55 | if isinstance(other, Style): 56 | return self.mapped_font_size < other.mapped_font_size or \ 57 | self.mapped_font_size == other.mapped_font_size and not self.bold and other.bold 58 | return False 59 | 60 | def __eq__(self, other): 61 | if isinstance(other, Style): 62 | return self.mapped_font_size == other.mapped_font_size and \ 63 | self.bold == other.bold 64 | else: 65 | return False 66 | -------------------------------------------------------------------------------- /pdfstructure/printer.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Iterator 3 | 4 | from pdfstructure.hierarchy.traversal import traverse_in_order 5 | from pdfstructure.model.document import Section, StructuredPdfDocument, TextElement 6 | from pdfstructure.model.style import Style 7 | from pdfstructure.utils import dict_subset 8 | 9 | 10 | class Printer: 11 | def print(self, document: StructuredPdfDocument, *args, **kwargs): 12 | pass 13 | 14 | 15 | class PrettyStringPrinter(Printer): 16 | """ 17 | pretty prints nested document structure with \t prefixes per level. 18 | """ 19 | 20 | @staticmethod 21 | def get_title_prefix(level): 22 | return "".join(["\t" for i in range(level)]) 23 | 24 | def make_item_pretty(self, item_gen: Iterator[Section]): 25 | """ 26 | yield pretty string representation of given element 27 | - add prefix for each paragraph, corresponding to its level 28 | - put Title in brackets 29 | @param item_gen: all elements in order, generator provided by StructuredPdfDocument.traverse() 30 | """ 31 | for element in item_gen: 32 | prefix = self.get_title_prefix(element.level) 33 | formatted_text = element.heading_text.rstrip().replace("\n", "\n{}".format(prefix)) 34 | # if element has children, then its content is a header 35 | if element.children: 36 | content = "\n\n{}[{}]".format(prefix, formatted_text) 37 | else: 38 | # render as normal content 39 | content = "\n{}{}".format(prefix, formatted_text) 40 | yield content 41 | 42 | def print(self, document: StructuredPdfDocument, *args, **kwargs): 43 | element_iterator = traverse_in_order(document) 44 | data = [item for item in self.make_item_pretty(element_iterator)] 45 | return "".join(data) 46 | 47 | 48 | class PrettyStringFilePrinter(PrettyStringPrinter): 49 | def print(self, document: StructuredPdfDocument, *args, **kwargs) -> str: 50 | """ 51 | 52 | @param **kwargs: 53 | @param document: 54 | @param args: 55 | @param kwargs: 56 | Keyword Args: 57 | file_path (str): path to output file 58 | @return: file_path to outputfile 59 | """ 60 | output_file = kwargs.get("file_path") 61 | print("write to file: {}".format(output_file)) 62 | with open(output_file, "w") as file: 63 | element_iterator = traverse_in_order(document) 64 | for pretty in self.make_item_pretty(element_iterator): 65 | file.write(pretty) 66 | return output_file 67 | 68 | 69 | class ElementTextEncoder(json.JSONEncoder): 70 | def default(self, e): 71 | if isinstance(e, TextElement): 72 | properties = e.__dict__.copy() 73 | properties["data"] = e._data.get_text() 74 | return properties 75 | else: 76 | return super().default(e) 77 | 78 | 79 | def encode_pdf_element(obj): 80 | """ 81 | customizse pdf element encoding 82 | - get rid of detailed pdf information retrieved from pdfminer like bounding box coords 83 | - use mapped fontsize name instead of ordinal value 84 | @param obj: 85 | @return: 86 | """ 87 | if isinstance(obj, TextElement): 88 | properties = dict_subset(obj.__dict__.copy(), ("_data", "_text")) 89 | properties["text"] = obj.text 90 | properties["style"] = encode_pdf_element(obj.style) 91 | return properties 92 | elif isinstance(obj, Style): 93 | properties = obj.__dict__.copy() 94 | properties["mapped_font_size"] = str(obj.mapped_font_size.name) 95 | return properties 96 | else: 97 | return obj.__dict__ 98 | 99 | 100 | class JsonStringPrinter(Printer): 101 | def print(self, document: StructuredPdfDocument, *args, **kwargs): 102 | return json.dumps(document, default=encode_pdf_element, indent=4) 103 | 104 | 105 | class JsonFilePrinter(Printer): 106 | def print(self, document: StructuredPdfDocument, *args, **kwargs): 107 | """ 108 | @param document: 109 | @param args: 110 | @param kwargs: 111 | Keyword Args: 112 | file_path (str): path to output file 113 | @return: 114 | """ 115 | file_path = kwargs.get("file_path") 116 | with open(file_path, "w") as fp: 117 | json.dump(document, fp=fp, default=encode_pdf_element, indent=4) 118 | return file_path 119 | -------------------------------------------------------------------------------- /pdfstructure/source.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from typing import Generator, Any 3 | 4 | from pdfminer.high_level import extract_pages 5 | from pdfminer.layout import LTTextContainer, LAParams, LTFigure, LTTextBoxHorizontal, LTTextLineHorizontal, LTChar, \ 6 | LTTextBoxVertical 7 | 8 | 9 | class Source: 10 | """ 11 | Abstract interface to read a PDF from somewhere. 12 | """ 13 | 14 | def __init__(self, uri=None): 15 | """ 16 | 17 | @param uri: points to pdf that should be read 18 | """ 19 | self.uri = uri 20 | 21 | def config(self): 22 | """ 23 | get source configuration 24 | @return: 25 | """ 26 | pass 27 | 28 | def read(self, *args, **kwargs) -> Generator[LTTextContainer, Any, None]: 29 | """ 30 | yields flat list of paragraphs within a document. 31 | @param args: 32 | @param kwargs: 33 | @return: 34 | """ 35 | pass 36 | 37 | 38 | class FileSource(Source): 39 | def __init__(self, file_path: str, page_numbers=None, 40 | la_params=LAParams(boxes_flow=0.3, detect_vertical=True, line_margin=0.3)): 41 | super().__init__(uri=file_path) 42 | self.page_numbers = page_numbers 43 | self.la_params = la_params 44 | 45 | def config(self): 46 | return self.__dict__ 47 | 48 | def __handle_lt_figure(self, element: LTFigure): 49 | """ 50 | sometimes pieces of text are wrongly detected as LTFigure, e.g. in slide-sets with border lines. 51 | -> extract text from LTFigure line by line put them into a LTTextBoxHorizontal as a workaround 52 | @return: LTTextBoxHorizontal containing found texts line by line 53 | """ 54 | # check if text is hold within figure element, forward 55 | 56 | line = LTTextLineHorizontal(0) 57 | wrapper = LTTextBoxHorizontal() 58 | wrapper.add(line) 59 | 60 | y_prior = element._objs[0].y0 61 | 62 | for letter in element: 63 | if isinstance(letter, LTChar): 64 | if abs(letter.y0 - y_prior) > 0.05: 65 | # new line, yield wrapper 66 | wrapper.analyze(self.la_params) 67 | yield wrapper 68 | 69 | wrapper = LTTextBoxHorizontal() 70 | line = LTTextLineHorizontal(0) 71 | wrapper.add(line) 72 | y_prior = letter.y0 73 | 74 | line.add(letter) 75 | 76 | def split_boxes_by_style(self, container: LTTextContainer) -> Generator[LTTextContainer, LTTextContainer, None]: 77 | """ 78 | pdfminers paragraphs are sometimes too broad and contain lines that should be splitted into header and content 79 | @param container: the extracted original paragraph 80 | """ 81 | if isinstance(container, LTTextBoxVertical): 82 | yield container 83 | return 84 | 85 | line: LTTextLineHorizontal 86 | wrapper = LTTextBoxHorizontal() 87 | wrapper.page = container.page 88 | stack = [] 89 | for line in container: 90 | size = max([obj.size for obj in itertools.islice(line, 10) if isinstance(obj, LTChar)]) 91 | if not stack: 92 | wrapper.add(line) 93 | stack.append(size) 94 | else: 95 | prior = stack.pop() 96 | stack.append(size) 97 | diff = abs(prior - size) 98 | if diff != 0 and max(prior, size) / min(prior, size) > 1.15: 99 | # break paragraph 100 | yield wrapper 101 | wrapper = LTTextBoxHorizontal() 102 | wrapper.add(line) 103 | yield wrapper 104 | 105 | def read(self, override_la_params=None, override_page_numbers=None) -> Generator[LTTextContainer, Any, None]: 106 | pNumber = 0 107 | # disable boxes_flow, style based hierarchy detection is based on purely flat list of paragraphs 108 | # params = LAParams(boxes_flow=None, detect_vertical=False) # setting for easy doc 109 | # params = LAParams(boxes_flow=0.5, detect_vertical=True) # setting for column doc 110 | if override_la_params: 111 | # use dynamic line_margin 112 | self.la_params.line_margin = override_la_params.line_margin 113 | # todo, do pre-analysis in count_sizes --> are there many boxes within same line 114 | # todo, understand LAParams, for columns, NONE works better, for vertical only layout LAParams(boxes_flow=None, detect_vertical=False) works better!! :O 115 | # do some sort of layout analyis, if there are many boxes vertically next to each other, use layout analysis 116 | # - column type 117 | # - straight forward document 118 | for page_layout in extract_pages(self.uri, 119 | laparams=self.la_params, 120 | page_numbers=self.page_numbers if not override_page_numbers else override_page_numbers): 121 | for element in page_layout: 122 | element.page = pNumber 123 | if isinstance(element, LTTextContainer): 124 | yield from self.split_boxes_by_style(element) 125 | #yield element 126 | elif isinstance(element, LTFigure): 127 | yield from self.__handle_lt_figure(element) 128 | pNumber += 1 129 | -------------------------------------------------------------------------------- /pdfstructure/utils.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import math 3 | import os 4 | from pathlib import Path 5 | from typing import Generator 6 | 7 | from pdfminer.high_level import extract_pages 8 | from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LAParams, LTTextLineHorizontal 9 | 10 | 11 | def char_generator(text_container: LTTextContainer): 12 | for container in text_container: 13 | if isinstance(container, LTChar): 14 | yield container 15 | elif isinstance(container, LTTextLine): 16 | for obj in container: 17 | if isinstance(obj, LTChar): 18 | yield obj 19 | 20 | 21 | def dict_subset(d, exclude_keys): 22 | return {k: v for k, v in d.items() if k not in exclude_keys} 23 | 24 | 25 | def word_generator(text_container: LTTextContainer): 26 | """ 27 | iterates through container's characters and yields a word as soon as found (trailing whitespace). 28 | @param text_container: 29 | @return: 30 | """ 31 | characters = [] 32 | 33 | for obj in char_generator(text_container): 34 | character = obj.get_text() 35 | if character != " ": 36 | characters.append(character) 37 | else: 38 | word = "".join(characters).strip() 39 | # skip if word is just a whitespace on its own 40 | if len(word) > 0: 41 | yield word 42 | characters.clear() 43 | if characters: 44 | yield "".join(characters) 45 | 46 | 47 | def element_generator(file_path: str, page_numbers=None) -> Generator[LTTextContainer, None, None]: 48 | """ 49 | yields flat list of paragraphs within a document. 50 | :param file_path: 51 | :return: 52 | """ 53 | pNumber = 0 54 | # disable boxes_flow, style based hierarchy detection is based on purely flat list of paragraphs 55 | params = LAParams(boxes_flow=None, detect_vertical=False) # setting for easy doc 56 | # params = LAParams(boxes_flow=0.5, detect_vertical=True) # setting for column doc 57 | # todo, do pre-analysis in count_sizes --> are there many boxes within same line 58 | # todo, understand LAParams, for columns, NONE works better, for vertical only layout LAParams(boxes_flow=None, detect_vertical=False) works better!! :O 59 | # do some sort of layout analyis, if there are many boxes vertically next to each other, use layout analysis 60 | # - column type 61 | # - straight forward document 62 | for page_layout in extract_pages(file_path, laparams=params, page_numbers=page_numbers): 63 | for element in page_layout: 64 | if isinstance(element, LTTextContainer): 65 | element.meta = {"page": pNumber} 66 | yield element 67 | pNumber += 1 68 | 69 | 70 | def truncate(number, decimals=0): 71 | """ 72 | Returns a value truncated to a specific number of decimal places. 73 | """ 74 | if not isinstance(decimals, int): 75 | raise TypeError("decimal places must be an integer.") 76 | elif decimals < 0: 77 | raise ValueError("decimal places has to be 0 or more.") 78 | elif decimals == 0: 79 | return math.trunc(number) 80 | 81 | factor = 10.0 ** decimals 82 | return math.trunc(number * factor) / factor 83 | 84 | 85 | class DocTypeFilter: 86 | 87 | def __init__(self, endings=("doc", "docx", "ppt", "pptx", "xls", "xlsx", "odt", "rtf")): 88 | self.endings = endings if isinstance(endings, (list, tuple)) else (endings) 89 | 90 | def test(self, name): 91 | return name.split(".")[-1].lower() in self.endings 92 | 93 | 94 | def closest_key(sorted_dict, key): 95 | "Return closest key in `sorted_dict` to given `key`." 96 | assert len(sorted_dict) > 0 97 | keys = list(itertools.islice(sorted_dict.irange(minimum=key), 1)) 98 | keys.extend(itertools.islice(sorted_dict.irange(maximum=key, reverse=True), 1)) 99 | return min(keys, key=lambda k: abs(key - k)) 100 | 101 | 102 | def find_file(root_dir: str, type_filter: DocTypeFilter, print_mod=10) -> iter([Path]): 103 | processed = 0 104 | for root, dirs, files in os.walk(root_dir): 105 | for file in files: 106 | if type_filter.test(file): 107 | yield Path(root + "/" + file) 108 | processed += 1 109 | if print_mod and processed % print_mod == 0: 110 | print("\nprocessed {}\n".format(processed)) 111 | print("found {} file-paths".format(processed)) 112 | 113 | 114 | def head_char_line(container: LTTextLineHorizontal) -> LTChar: 115 | """ 116 | :rtype LTChar 117 | :param container: 118 | :return: 119 | """ 120 | for obj in container: 121 | if isinstance(obj, LTChar): 122 | return obj 123 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # PDF Structural Parser 2 | `pdfstructure` detects, splits and organises the documents text content into its natural structure as envisioned by the author. 3 | The document structure, or hierarchy, stores the relation between chapters, sections and their sub sections in a nested, recursive manner. 4 | 5 | 6 | `pdfstructure` is in early development and built on top of [pdfminer.six](https://github.com/pdfminer/pdfminer.six). 7 | 8 | - Paragraph extraction is performed leveraging `pdfminer.high_level.extract_pages()`. 9 | - Those paragraphs are then grouped together according to some basic (extendable) heuristics. 10 | 11 | ## Document Model 12 | 13 | ``` 14 | class StructuredDocument: 15 | metadata: dict 16 | sections: List[Section] 17 | 18 | class Section: 19 | content: TextElement 20 | children: List[Section] 21 | level: int 22 | 23 | class TextElement: 24 | text: LTTextContainer # the extracted paragraph from pdfminer 25 | style: Style 26 | ``` 27 | 28 | ## Load and parse PDF 29 | 30 | **Illustration of document structure** 31 | 32 | The following screenshot contains sections and subsections with their respective content. 33 | In that case, the structure can be easily parsed by leveraging the Font Style only. 34 | 35 | ![Example PDF](tests/resources/interview_cheatsheet-excerpt.png?raw=true) 36 | *PDF source: [github.com/TSiege](https://gist.github.com/TSiege/cbb0507082bb18ff7e4b)* 37 | 38 | 39 | **Parse PDF** 40 | ``` 41 | from pdfstructure.hierarchy.parser import HierarchyParser 42 | from pdfstructure.source import FileSource 43 | 44 | parser = HierarchyParser() 45 | 46 | # specify source (that implements source.read()) 47 | source = FileSource(path) 48 | 49 | # analyse document and parse as nested data structure 50 | document = parser.parse_pdf(source) 51 | ``` 52 | 53 | ### Serialize Document to String 54 | To export the parsed structure, use a printer implementation. 55 | ``` 56 | from pdfstructure.printer import PrettyStringPrinter 57 | 58 | stringExporter = PrettyStringPrinter() 59 | prettyString = stringExporter.print(document) 60 | ``` 61 | 62 | **Excerpt of the parsed document (serialized to string)** 63 | 64 | [Parsed data: interview_cheatsheet_pretty.txt](tests/resources/parsed/interview_cheatsheet_pretty.txt?raw=true) 65 | ``` 66 | [Search Basics] 67 | [Breadth First Search] 68 | [Definition:] 69 | An algorithm that searches a tree (or graph) by searching levels of the tree first, starting at the root. 70 | It finds every node on the same level, most often moving left to right. 71 | While doing this it tracks the children nodes of the nodes on the current level. 72 | When finished examining a level it moves to the left most node on the next level. 73 | The bottom-right most node is evaluated last (the node that is deepest and is farthest right of it's level). 74 | 75 | [What you need to know:] 76 | Optimal for searching a tree that is wider than it is deep. 77 | Uses a queue to store information about the tree while it traverses a tree. 78 | Because it uses a queue it is more memory intensive than depth first search. 79 | The queue uses more memory because it needs to stores pointers 80 | ``` 81 | 82 | ### Encode Document to JSON 83 | ``` 84 | from pdfstructure.printer import JsonFilePrinter 85 | 86 | printer = JsonFilePrinter() 87 | file_path = Path("resources/parsed/interview_cheatsheet.json") 88 | 89 | printer.print(document, file_path=str(file_path.absolute())) 90 | ``` 91 | 92 | [Parsed data: interview_cheatsheet.json](tests/resources/parsed/interview_cheatsheet.json?raw=true) 93 | 94 | **Excerpt of exported json** 95 | ``` 96 | { 97 | "metadata": { 98 | "style_info": { 99 | "_data": { 100 | "7.99": 24, 101 | "9.6": 1, 102 | "6.4": 7, 103 | "7.47": 3, 104 | "12.8": 7, 105 | "8.53": 206, 106 | "10.67": 12, 107 | "7.25": 14 108 | }, 109 | "_body_size": 8.53, 110 | "_min_found_size": 6.4, 111 | "_max_found_size": 12.8 112 | }, 113 | "filename": "interview_cheatsheet.pdf" 114 | }, 115 | "elements": [ 116 | { 117 | "content": { 118 | "style": { 119 | "bold": true, 120 | "italic": false, 121 | "font_name": ".SFNSDisplay-Semibold", 122 | "mapped_font_size": "xlarge", 123 | "mean_size": 12.8, 124 | "max_size": 12.806323818403143 125 | }, 126 | "page": 0, 127 | "text": "Data Structure Basics" 128 | }, 129 | "children": [ 130 | { 131 | "content": { 132 | "style": { 133 | "bold": true, 134 | "italic": false, 135 | "font_name": ".SFNSDisplay-Semibold", 136 | "mapped_font_size": "large", 137 | "mean_size": 10.6, 138 | "max_size": 10.671936515335972 139 | }, 140 | "page": 0, 141 | "text": "Array" 142 | }, 143 | "children": [ 144 | { 145 | "content": { 146 | "style": { 147 | "bold": true, 148 | "italic": false, 149 | "font_name": ".SFNSText-Semibold", 150 | "mapped_font_size": "middle", 151 | "mean_size": 8.5, 152 | "max_size": 8.537549212268772 153 | }, 154 | "page": 0, 155 | "text": "Definition:" 156 | }, 157 | .... 158 | ``` 159 | 160 | 161 | ### Load JSON as StructuredPdfDocument 162 | 163 | Of course, encoded documents can be easily decoded and used for further analysis. 164 | However, detailed information like bounding boxes or coordinates for each character are not persisted. 165 | 166 | ``` 167 | from pdfstructure.model.document import StructuredPdfDocument 168 | 169 | jsonString = json.load(file) 170 | document = StructuredPdfDocument.from_json(jsonString) 171 | 172 | print(document.title) 173 | `` 174 | $ "interview_cheatsheet.pdf" 175 | ``` 176 | 177 | ## Traverse through document structure 178 | Having all paragraphs and sections organised as a general tree, 179 | its straight forward to iterate through the layers and search for specific elements like headlines, or extract all main headers like chapter titles. 180 | 181 | Two document traversal generators are available that yield each section `in-order` or in `level-order` respectively. 182 | ``` 183 | from pdfstructure.hierarchy.traversal import traverse_in_order 184 | 185 | elements_flat_in_order = [element for element in traverse_in_order(document)] 186 | 187 | Exemplary illustration of yield order: 188 | """ 189 | 5 10 190 | / \ \ 191 | 1 2 3 192 | / | \ | 193 | a b c x 194 | 195 | yield order: 196 | - [5,1,a,b,c,2,10,3,x] 197 | """ 198 | ``` 199 | 200 | 201 | # TODOs 202 | - [ ] **Detect the document layout type (Columns, Book, Magazine)** 203 | 204 | The provided layout analysis algorithm by pdfminer.six performs well on more straightforward documents with default settings. 205 | However, more complicated layouts like scientific papers need custom `LAParams` settings to retrieve paragraphs in correct reading order. 206 | - [ ] High level diagram of algorithm workflow 207 | - [ ] Performance improvement in terms of speed -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sortedcontainers==2.2.2 2 | pdfminer.six==20200517 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='pdfstructure', 5 | description="PDF Natural Structure Parser", 6 | version='0.0.1', 7 | author="Christian Hofer", 8 | author_email="christianhofer91@gmail.com", 9 | packages=find_packages(exclude=("tests")) 10 | ) 11 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/__init__.py -------------------------------------------------------------------------------- /tests/helper.py: -------------------------------------------------------------------------------- 1 | from pdfstructure.analysis.annotate import StyleAnnotator 2 | from pdfstructure.analysis.sizemapper import PivotLogMapper 3 | from pdfstructure.analysis.styledistribution import count_sizes 4 | from pdfstructure.utils import element_generator 5 | 6 | 7 | def generate_annotated_lines(file_path): 8 | """ 9 | yields paragraph detected by pdfminer annotated with detected & mapped style information 10 | """ 11 | element_gen = element_generator(file_path) 12 | distribution = count_sizes(element_gen) 13 | sizeMapper = PivotLogMapper(distribution) 14 | style_annotator = StyleAnnotator(sizemapper=sizeMapper, style_info=distribution) 15 | 16 | elements = element_generator(file_path) 17 | with_style = style_annotator.process(elements) 18 | 19 | yield from with_style 20 | -------------------------------------------------------------------------------- /tests/resources/5648.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/resources/5648.pdf -------------------------------------------------------------------------------- /tests/resources/IE00BM67HT60-ATB-FS-DE-2020-2-28.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/resources/IE00BM67HT60-ATB-FS-DE-2020-2-28.pdf -------------------------------------------------------------------------------- /tests/resources/SameSize_BoldTitle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/resources/SameSize_BoldTitle.pdf -------------------------------------------------------------------------------- /tests/resources/SameSize_EnumeratedTitle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/resources/SameSize_EnumeratedTitle.pdf -------------------------------------------------------------------------------- /tests/resources/SameStyleOnly.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/resources/SameStyleOnly.pdf -------------------------------------------------------------------------------- /tests/resources/interview_cheatsheet-excerpt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/resources/interview_cheatsheet-excerpt.png -------------------------------------------------------------------------------- /tests/resources/interview_cheatsheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/resources/interview_cheatsheet.pdf -------------------------------------------------------------------------------- /tests/resources/lorem.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/resources/lorem.pdf -------------------------------------------------------------------------------- /tests/resources/paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/resources/paper.pdf -------------------------------------------------------------------------------- /tests/resources/parsed/interview_cheatsheet_pretty.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | [] 4 | 31/10/2019 5 | This is my technical interview cheat sheet. Feel free to fork it or do whatever you want with it. PLEASE let me know if there are any errors or if a… 6 | 7 | [TSiege / The Technical Interview Cheat Sheet.md] 8 | Last active 2 days ago • Report abuse 9 | Embed 10 | Download ZIP 11 |