├── .gitignore
├── examples
    └── __init__.py
├── pdfstructure
    ├── __init__.py
    ├── analysis
    │   ├── __init__.py
    │   ├── annotate.py
    │   ├── sizemapper.py
    │   └── styledistribution.py
    ├── hierarchy
    │   ├── __init__.py
    │   ├── detectheader.py
    │   ├── headercompare.py
    │   ├── parser.py
    │   └── traversal.py
    ├── model
    │   ├── __init__.py
    │   ├── document.py
    │   └── style.py
    ├── printer.py
    ├── source.py
    └── utils.py
├── readme.md
├── requirements.txt
├── setup.py
└── tests
    ├── __init__.py
    ├── helper.py
    ├── resources
        ├── 5648.pdf
        ├── IE00BM67HT60-ATB-FS-DE-2020-2-28.pdf
        ├── SameSize_BoldTitle.pdf
        ├── SameSize_EnumeratedTitle.pdf
        ├── SameStyleOnly.pdf
        ├── interview_cheatsheet-excerpt.png
        ├── interview_cheatsheet.pdf
        ├── lorem.pdf
        ├── paper.pdf
        ├── parsed
        │   ├── interview_cheatsheet.json
        │   └── interview_cheatsheet_pretty.txt
        └── samplepptx.pdf
    ├── test_custom_use_cases.py
    ├── test_document.py
    ├── test_headercompare.py
    ├── test_hierarchy.py
    ├── test_printer.py
    ├── test_style_analyser.py
    ├── test_traversal.py
    └── test_utils.py


/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | .idea
3 | 


--------------------------------------------------------------------------------
/examples/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/examples/__init__.py


--------------------------------------------------------------------------------
/pdfstructure/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/pdfstructure/__init__.py


--------------------------------------------------------------------------------
/pdfstructure/analysis/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/pdfstructure/analysis/__init__.py


--------------------------------------------------------------------------------
/pdfstructure/analysis/annotate.py:
--------------------------------------------------------------------------------
 1 | import statistics
 2 | from collections import Counter
 3 | 
 4 | from pdfminer.layout import LTTextBoxHorizontal, LTChar
 5 | 
 6 | from pdfstructure.analysis.sizemapper import SizeMapper
 7 | from pdfstructure.analysis.styledistribution import StyleDistribution
 8 | from pdfstructure.model.document import TextElement
 9 | from pdfstructure.model.style import Style, TextSize
10 | from pdfstructure.utils import truncate
11 | 
12 | 
13 | class StyleAnnotator:
14 |     """
15 |     creates a PdfElements from incoming pdf-paragraphs (raw LTTextContainer from pdfminer.six).
16 |     - annotates paragraph with @Style(italic, bold, fontname, mapped_size, mean_size).
17 |     - mapped_font_size: captures most dominant character size within paragraph & maps it to TextSize Enum.
18 |       mapped Size is leveraged by the hierarchy detection algorithm.
19 |     """
20 | 
21 |     def __init__(self, sizemapper: SizeMapper, style_info: StyleDistribution):
22 |         self._sizeMapper = sizemapper
23 |         self._styleInfo = style_info
24 | 
25 |     @staticmethod
26 |     def __investigate_box_style(element):
27 |         fonts = Counter()
28 |         sizes = []
29 |         for line in element:
30 |             for c in line:
31 |                 if isinstance(c, LTChar):
32 |                     fonts.update([c.fontname])
33 |                     sizes.append(c.size)
34 |         return fonts, sizes
35 | 
36 |     def process(self, element_gen):  # element: LTTextContainer):
37 |         """"
38 |         annotate each element with fontsize
39 |         """
40 |         for element in element_gen:
41 |             if isinstance(element, LTTextBoxHorizontal):
42 | 
43 |                 fonts, sizes = self.__investigate_box_style(element)
44 |                 if not fonts or not element.get_text().rstrip():
45 |                     continue
46 | 
47 |                 font_name = fonts.most_common(1)[0][0]
48 |                 mean_size = truncate(statistics.mean(sizes), 1)
49 |                 max_size = max(sizes)
50 |                 # todo currently empty boxes are forwarded.. with holding only \n
51 |                 mapped_size = self._sizeMapper.translate(target_enum=TextSize,
52 |                                                          value=max_size)
53 |                 s = Style(bold="bold" in str(font_name.lower()),
54 |                           italic="italic" in font_name.lower(),
55 |                           font_name=font_name,
56 |                           mapped_font_size=mapped_size,
57 |                           mean_size=mean_size, max_size=max_size)
58 | 
59 |                 # todo, split lines within LTTextBoxHorizontal
60 |                 #  split using style as differentiator
61 |                 #  e.g 1st is title with bold text
62 |                 #      2nd & 3rd line are introduction lines with body style
63 |                 #      -> forward 2 boxes (header, content)
64 |                 yield TextElement(text_container=element, style=s,
65 |                                   page=element.page if hasattr(element, "page") else None)
66 | 


--------------------------------------------------------------------------------
/pdfstructure/analysis/sizemapper.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | from enum import Enum
  3 | from typing import Type
  4 | 
  5 | from pdfstructure.analysis.styledistribution import StyleDistribution
  6 | from pdfstructure.model.style import TextSize
  7 | 
  8 | 
  9 | class SizeMapper:
 10 | 
 11 |     def __init__(self):
 12 |         self._borders = None
 13 | 
 14 |     @property
 15 |     def borders(self):
 16 |         return self._borders
 17 | 
 18 |     def translate(self, target_enum: Type[TextSize], value) -> Enum:
 19 |         return TextSize.from_range(self.borders, value)
 20 | 
 21 | 
 22 | class PivotLogMapper(SizeMapper):
 23 |     def __init__(self, style_info: StyleDistribution, bins=5):
 24 |         super().__init__()
 25 |         self.bins = bins
 26 |         borders = []
 27 |         # find pivot
 28 |         # diff pivot to max & min
 29 |         pivot = style_info.body_size
 30 |         # if style_info.min_found_size <= pivot <= style_info.max_found_size:
 31 |         right_span = style_info.max_found_size - pivot
 32 |         left_span = pivot - style_info.min_found_size
 33 | 
 34 |         if right_span > pivot * 2:
 35 |             right_span = pivot * 2
 36 |         if right_span == 0:
 37 |             right_span = 5
 38 |         if left_span == 0:
 39 |             left_span = 5
 40 | 
 41 |         targetSteps = bins / 2.
 42 |         alpha = 0.5
 43 |         thRunner = pivot
 44 |         mem = 0
 45 |         for i in range(1, int((bins / 2) + 1)):
 46 |             scaledStep = left_span / targetSteps * self.weight(i) + mem * alpha
 47 |             thRunner -= scaledStep
 48 |             mem = scaledStep
 49 |             borders.insert(0, thRunner)
 50 |         thRunner = pivot
 51 |         mem = 0
 52 |         for i in range(1, int((bins / 2) + 1)):
 53 |             scaledStep = right_span / targetSteps * self.weight(i) + mem * alpha
 54 |             thRunner += scaledStep
 55 |             mem = scaledStep
 56 |             borders.append(thRunner)
 57 | 
 58 |         self._borders = tuple(borders)
 59 | 
 60 |     @staticmethod
 61 |     def weight(n):
 62 |         """
 63 |         used in sizemapper, walking in N steps from pivot point towards max & min found size.
 64 |         first step is weighted more (to have a more narrow pivot), the further towards edge values, the further the step.
 65 |         @param n:
 66 |         @return:
 67 |         """
 68 |         return 1.0 - 1. / math.exp(n - 0.2)
 69 | 
 70 | 
 71 | class PivotLinearMapper(SizeMapper):
 72 | 
 73 |     def __init__(self, style_info: StyleDistribution):
 74 |         # find pivot
 75 |         # diff pivot to max & min
 76 |         super().__init__()
 77 |         pivot = style_info.body_size
 78 |         # if style_info.min_found_size <= pivot <= style_info.max_found_size:
 79 |         right_span = style_info.max_found_size - pivot
 80 |         left_span = pivot - style_info.min_found_size
 81 |         # else:
 82 |         #   print("error?")
 83 | 
 84 |         right_step = (right_span / 2.) * 0.5
 85 |         left_step = (left_span / 2.) * 0.5
 86 | 
 87 |         b0, b1 = style_info.min_found_size + left_step, style_info.min_found_size + left_step * 2
 88 |         b2, b3 = pivot + right_step, pivot + right_step * 2
 89 |         self._borders = (b0, b1, b2, b3)
 90 | 
 91 | 
 92 | class LinearSizeMapper(SizeMapper):
 93 | 
 94 |     def __init__(self, style_info: StyleDistribution):
 95 |         super().__init__()
 96 |         self.style_info = style_info
 97 | 
 98 |     def translate(self, target_enum, value) -> Enum:
 99 |         # Figure out how 'wide' each range is
100 |         leftSpan = self.style_info.max_found_size - self.style_info.min_found_size
101 |         rightSpan = target_enum.xlarge.value - target_enum.xsmall.value
102 | 
103 |         # Convert the left range into a 0-1 range (float)
104 |         scaled = float(value - self.style_info.min_found_size) / float(leftSpan)
105 |         if scaled > 1.0:
106 |             return target_enum.xlarge
107 |         elif scaled < 0:
108 |             return target_enum.xsmall
109 | 
110 |         else:
111 |             # Convert the 0-1 range into a value in the right range.
112 |             return TextSize(int(target_enum.xsmall.value + (scaled * rightSpan)))
113 | 


--------------------------------------------------------------------------------
/pdfstructure/analysis/styledistribution.py:
--------------------------------------------------------------------------------
  1 | import itertools
  2 | from collections import Counter, defaultdict
  3 | 
  4 | from pdfminer.layout import LTTextContainer, LTTextLine, LTChar
  5 | from sortedcontainers import SortedDict
  6 | 
  7 | from pdfstructure.utils import truncate, closest_key
  8 | 
  9 | 
 10 | class StyleDistribution:
 11 |     """
 12 |     Represents style information for one analysed element stream (typically one stream per document).
 13 |     """
 14 | 
 15 |     def __init__(self, data=None, line_margin=0.5):
 16 |         """
 17 |         
 18 |         :type data: Counter
 19 |         :param data:
 20 |         """
 21 |         if data:
 22 |             self._data = data
 23 |             self._body_size = data.most_common(1)[0][0]
 24 |             self._min_found_size, self._max_found_size = min(data.keys()), max(data.keys())
 25 |             if self._min_found_size == self._max_found_size:
 26 |                 self._min_found_size /= 2
 27 |                 self._max_found_size *= 2
 28 |         self._line_margin = line_margin
 29 | 
 30 |     @property
 31 |     def line_margin(self):
 32 |         return self._line_margin
 33 | 
 34 |     def norm_data_binned(self, bins=50):
 35 |         amount_items = self.amount_values
 36 |         step = 1.0 / bins
 37 |         keys = [step * i for i in range(bins)]
 38 |         normalised = SortedDict({key: 0.0 for key in keys})
 39 |         for size in self.data:
 40 |             norm_key = truncate(size / self.max_found_size, 2)
 41 |             k = closest_key(normalised, norm_key)
 42 |             normalised[k] += float(self.data[size]) / amount_items
 43 | 
 44 |         return normalised
 45 | 
 46 |     @property
 47 |     def norm_data(self):
 48 |         # normalise counts with total amount of collected values
 49 |         # normalise each key value against max found key value (size)
 50 |         # normalise X & Y
 51 |         normalised = defaultdict(int)
 52 |         amount_items = self.amount_values
 53 |         for size in self.data:
 54 |             normalised[truncate(size / self.max_found_size, 2)] += float(self.data[size]) / amount_items
 55 | 
 56 |         return normalised
 57 | 
 58 |     @property
 59 |     def min_found_size(self):
 60 |         return self._min_found_size
 61 | 
 62 |     @property
 63 |     def max_found_size(self):
 64 |         return self._max_found_size
 65 | 
 66 |     @staticmethod
 67 |     def get_min_size(data: Counter, body_size, title_size):
 68 |         if len(data) > 2:
 69 |             tmin = sorted(data.keys(), reverse=True)[:3][-1]
 70 |             return tmin if tmin > body_size else title_size - 0.5
 71 |         else:
 72 |             return title_size - 0.5
 73 | 
 74 |     @property
 75 |     def body_size(self):
 76 |         return self._body_size
 77 | 
 78 |     @property
 79 |     def is_empty(self):
 80 |         return not self._data
 81 | 
 82 |     @property
 83 |     def amount_values(self):
 84 |         return sum(self._data.values(), 0.0)
 85 | 
 86 |     @property
 87 |     def amount_sizes(self):
 88 |         """
 89 |         amount of found sizes
 90 |         :return:
 91 |         """
 92 |         return len(self._data)
 93 | 
 94 |     @property
 95 |     def data(self) -> Counter:
 96 |         return self._data.copy()
 97 | 
 98 | 
 99 | class SizeAnalyser:
100 |     def __init__(self):
101 |         self.sizeDistribution = Counter()
102 | 
103 |     def consume(self, node: LTTextContainer):
104 |         sizes = list(itertools.islice(
105 |             [c.size for c in node if isinstance(c, LTChar)], 10))
106 |         # get max size, check that it occurred at least twice
107 |         maxSize = max(sizes)
108 |         if sizes.count(maxSize) > 2:
109 |             self.sizeDistribution.update([truncate(maxSize, 2)])
110 | 
111 |     def process_result(self):
112 |         pass
113 | 
114 | 
115 | class LineMarginAnalyer:
116 |     _previousNode: LTTextContainer
117 | 
118 |     def __init__(self):
119 |         self._distanceCounter = defaultdict(int)
120 |         self._headingTrailingCounter = defaultdict(int)
121 |         self._previousNode = None
122 |         self._y = None
123 |         self._previousBoxHeight = None
124 | 
125 |     def consume(self, node: LTTextContainer):
126 |         if self._previousNode:
127 |             diff = truncate(abs(self._previousNode.y0 - node.y1), 2)
128 |             if self._previousNode.height == node.height:
129 |                 self._distanceCounter[(diff, node.height)] += 1
130 |             else:
131 |                 self._headingTrailingCounter[(diff, self._previousNode.height, node.height)] += 1
132 | 
133 |         self._previousNode = node
134 | 
135 |     def process_result(self):
136 |         """
137 |         Find relative line margin threshold that will be used in pdfminers paragraphs algorithm.
138 |         lines that are vertically closer than margin * height are considered to belong to the same paragraph.
139 |         @return:
140 |         """
141 |         (abs_margin, line_height), count = max(self._distanceCounter.items(), key=lambda item: item[1])
142 |         body_line_margin = min(0.5, 1.75 * abs_margin / line_height)
143 |         # todo, find next largest value from title_trailing --> margin should be smaller than that
144 |         # sorted(self._headingTrailingCounter.keys(), key=lambda keys: keys[1])
145 |         # print("line margin: {}".format(line_margin))
146 |         return body_line_margin
147 | 
148 | 
149 | def count_sizes(element_gen) -> StyleDistribution:
150 |     """
151 |     analyse used fonts, character sizes, paragraph margins etc.
152 |     :param element_gen:
153 |     :return:
154 |     """
155 |     sizeAnalyser = SizeAnalyser()
156 |     lineMarginAnalyser = LineMarginAnalyer()
157 | 
158 |     for element in element_gen:
159 |         if isinstance(element, LTTextContainer):
160 |             for node in element:
161 |                 if not isinstance(node, LTTextLine) or node.is_empty() \
162 |                         or len(node._objs) == 0:
163 |                     continue
164 | 
165 |                 sizeAnalyser.consume(node)
166 |                 lineMarginAnalyser.consume(node)
167 | 
168 |     if not sizeAnalyser.sizeDistribution:
169 |         raise TypeError("document does not contain text")
170 | 
171 |     return StyleDistribution(sizeAnalyser.sizeDistribution, line_margin=lineMarginAnalyser.process_result())
172 | 


--------------------------------------------------------------------------------
/pdfstructure/hierarchy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/pdfstructure/hierarchy/__init__.py


--------------------------------------------------------------------------------
/pdfstructure/hierarchy/detectheader.py:
--------------------------------------------------------------------------------
 1 | from collections import Counter
 2 | 
 3 | from pdfminer.layout import LTTextBoxVertical
 4 | 
 5 | from pdfstructure.analysis.styledistribution import StyleDistribution
 6 | from pdfstructure.model.document import TextElement
 7 | from pdfstructure.model.style import TextSize
 8 | from pdfstructure.utils import word_generator
 9 | 
10 | 
11 | def header_detector(element: TextElement, style_distribution: StyleDistribution):
12 |     if isinstance(element._data, LTTextBoxVertical):
13 |         return False
14 |     stats = Counter()
15 |     terms = element._data
16 |     style = element.style
17 | 
18 |     if len(element.text) <= 2:
19 |         return False
20 | 
21 |     # data tuple per line, element from pdfminer, annotated style info for whole line
22 |     # todo, compute ratios over whole line // or paragraph :O
23 |     if (style.bold or style.italic) and style.mapped_font_size >= TextSize.middle \
24 |             or style.mapped_font_size > TextSize.middle \
25 |             or style.max_size > style_distribution.body_size + 2:
26 |         return check_valid_header_tokens(terms)
27 |     else:
28 |         return False
29 | 
30 | 
31 | def check_valid_header_tokens(element):
32 |     """
33 |     fr a paragraph to be treated as a header, it has to contain at least 2 letters.
34 |     @param element:
35 |     @return:
36 |     """
37 |     alpha_count = 0
38 |     numeric_count = 0
39 |     for word in word_generator(element):
40 |         for c in word:
41 |             if c.isalpha():
42 |                 alpha_count += 1
43 |             if c.isnumeric():
44 |                 numeric_count += 1
45 | 
46 |             if alpha_count >= 2:
47 |                 return True
48 |     return False
49 | 


--------------------------------------------------------------------------------
/pdfstructure/hierarchy/headercompare.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from pdfstructure.model.document import Section
 4 | from pdfstructure.utils import word_generator
 5 | 
 6 | numeration_pattern = re.compile("^(?=.*\d+)((?=.*\.)|(?=.*:)).*$")
 7 | white_space_pattern = re.compile("\\s+")
 8 | 
 9 | 
10 | class SubHeaderPredicate:
11 |     """
12 |     Compares two paragraphs that are classified as headers, but have the same mapped FontSize.
13 |     - Its possible that those headers are actually not on the same level, based on some conditions like H1 is enumerated & bold, H2 not.
14 |     """
15 | 
16 |     def __init__(self):
17 |         self._conditions = []
18 | 
19 |     def add_condition(self, condition):
20 |         self._conditions.append(condition)
21 | 
22 |     def test(self, h1, h2):
23 |         return any(condition(h1, h2) for condition in self._conditions)
24 | 
25 | 
26 | def get_default_sub_header_conditions():
27 |     _isSubHeader = SubHeaderPredicate()
28 |     _isSubHeader.add_condition(condition_boldness)
29 |     _isSubHeader.add_condition(condition_h1_enum_h2_not)
30 |     _isSubHeader.add_condition(condition_h2_extends_h1)
31 |     _isSubHeader.add_condition(condition_h1_slightly_bigger_h2)
32 |     return _isSubHeader
33 | 
34 | 
35 | def condition_boldness(h1: Section, h2: Section):
36 |     """
37 |     h2 is subheader if:if h1 is bold
38 |     - h1 is bold & h2 is not bold
39 |     - but skip if h2 is enumerated and h1 is not
40 |     @param h1:
41 |     @param h2:
42 |     @return:
43 |     """
44 |     h1start = next(word_generator(h1.heading._data))
45 |     h2start = next(word_generator(h2.heading._data))
46 |     if numeration_pattern.match(h2start) and not numeration_pattern.match(h1start):
47 |         return False
48 | 
49 |     return h1.heading.style.bold and not h2.heading.style.bold
50 | 
51 | 
52 | def condition_h2_extends_h1(h1: Section, h2: Section):
53 |     """
54 |     e.g.:   h1  ->  1.1 some header
55 |             h2  ->  1.1.2   some sub header
56 |     @param h1:
57 |     @param h2:
58 |     @return:
59 |     """
60 |     h1start = next(word_generator(h1.heading._data))
61 |     h2start = next(word_generator(h2.heading._data))
62 |     return len(h2start) > len(h1start) and h1start in h2start
63 | 
64 | 
65 | def condition_h1_enum_h2_not(h1: Section, h2: Section):
66 |     """
67 |     e.g.    h1  -> 1.1 some header title
68 |             h2  -> some other header title
69 |     -> applies only if both headers are of same style type
70 | 
71 |     """
72 |     if h2.heading.style.bold and not h1.heading.style.bold:
73 |         return False
74 |     # if h2.heading.style.font_name != h1.heading.style.font_name:
75 |     #    return False
76 | 
77 |     h1start = next(word_generator(h1.heading._data))
78 |     h2start = next(word_generator(h2.heading._data))
79 |     return numeration_pattern.match(h1start) and not numeration_pattern.match(h2start)
80 | 
81 | 
82 | def condition_h1_slightly_bigger_h2(h1: Section, h2: Section):
83 |     """s
84 |     Style analysis maps found sizes to a predefined enum (xsmall, small, large, xlarge).
85 |     but sometimes it makes sense to look deeper.
86 |     @param h1:
87 |     @param h2:
88 |     @return:
89 |     """
90 |     return h1.heading.style.mapped_font_size == h2.heading.style.mapped_font_size \
91 |            and h1.heading.style.max_size - h2.heading.style.max_size > 1.0
92 | 


--------------------------------------------------------------------------------
/pdfstructure/hierarchy/parser.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | from typing import List, Generator
  3 | 
  4 | from pdfminer.layout import LTTextContainer, LAParams
  5 | 
  6 | from pdfstructure.analysis.annotate import StyleAnnotator
  7 | from pdfstructure.analysis.sizemapper import PivotLogMapper
  8 | from pdfstructure.analysis.styledistribution import count_sizes, StyleDistribution
  9 | from pdfstructure.hierarchy.detectheader import header_detector
 10 | from pdfstructure.hierarchy.headercompare import get_default_sub_header_conditions
 11 | from pdfstructure.model.document import TextElement, Section, StructuredPdfDocument, DanglingTextSection
 12 | from pdfstructure.source import Source
 13 | 
 14 | 
 15 | class HierarchyParser:
 16 | 
 17 |     def __init__(self, sub_header_conditions=get_default_sub_header_conditions()):
 18 |         self._isSubHeader = sub_header_conditions
 19 | 
 20 |     def parse_pdf(self, source: Source) -> StructuredPdfDocument:
 21 |         """
 22 |         Analysises and parses a PDF document from a given @Source containing its natural hierarchy.
 23 |         @param source:
 24 |         @return:
 25 |         """
 26 |         # 1. iterate once through PDF and analyse style distribution
 27 |         distribution = count_sizes(source.read())
 28 |         size_mapper = PivotLogMapper(distribution)
 29 |         style_annotator = StyleAnnotator(sizemapper=size_mapper, style_info=distribution)
 30 | 
 31 |         # 2. iterate second time trough pdf
 32 |         # - annotate each paragraph with mapped Style
 33 |         elements_with_style = style_annotator.process(source.read(
 34 |             override_la_params=LAParams(line_margin=distribution.line_margin)))
 35 | 
 36 |         # - create nested document structure on the fly
 37 |         structured_elements = self.create_hierarchy(elements_with_style, distribution)
 38 | 
 39 |         # 3. create wrapped document and capture some metadata
 40 |         pdf_document = StructuredPdfDocument(elements=structured_elements, style_info=distribution)
 41 |         enrich_metadata(pdf_document, source)
 42 |         return pdf_document
 43 | 
 44 |     def create_hierarchy(self, element_gen: Generator[TextElement, LTTextContainer, None],
 45 |                          style_distribution: StyleDistribution) -> List[Section]:
 46 |         """
 47 |         Takes incoming flat list of paragraphs and creates nested natural order hierarchy.
 48 | 
 49 |         Example Structure:
 50 |         ==================
 51 |         Document.pdf
 52 |         <<
 53 |             1.  H1 Chapter Header
 54 |             content
 55 |                 1.2     H2 Section Header
 56 |                 content
 57 |                 1.3     H2 Section Header
 58 |                 content
 59 |                     1.3.1   H3 Subsection Header
 60 |                     content
 61 |             2.  H1 Chapter Header
 62 |             content
 63 |         >>
 64 | 
 65 |         @param element_gen:
 66 |         @return:
 67 |         """
 68 |         structured = []
 69 |         level_stack = []
 70 | 
 71 |         for element in element_gen:
 72 |             # if line is header
 73 |             style = element.style
 74 |             if header_detector(element, style_distribution):
 75 |                 child = Section(element)
 76 |                 header_size = style.mapped_font_size
 77 | 
 78 |                 # initial state - push and continue with next element
 79 |                 if not level_stack:
 80 |                     self.__push_to_stack(child, level_stack, structured)
 81 |                     continue
 82 | 
 83 |                 stack_peek_size = level_stack[-1].heading.style.mapped_font_size
 84 | 
 85 |                 if stack_peek_size > header_size:
 86 |                     # append element as children
 87 |                     self.__push_to_stack(child, level_stack, structured)
 88 | 
 89 |                 else:
 90 |                     # go up in hierarchy and insert element (as children) on its level
 91 |                     self.__pop_stack_until_match(level_stack, header_size, child)
 92 |                     self.__push_to_stack(child, level_stack, structured)
 93 | 
 94 |             else:
 95 |                 # no header found, add paragraph as a content element to previous node
 96 |                 # - content is on same level as its corresponding header
 97 |                 content_node = Section(element, level=len(level_stack))
 98 |                 if level_stack:
 99 |                     level_stack[-1].append_children(content_node)
100 |                 else:
101 |                     # if last element in output structure has also no header, merge
102 |                     if structured and isinstance(structured[-1], DanglingTextSection):
103 |                         structured[-1].append_children(content_node)
104 |                     else:
105 |                         # # add dangling content as section
106 |                         dangling_content = DanglingTextSection()
107 |                         dangling_content.append_children(content_node)
108 |                         dangling_content.set_level(len(level_stack))
109 |                         structured.append(dangling_content)
110 | 
111 |         return structured
112 | 
113 |     def __pop_stack_until_match(self, stack, headerSize, header):
114 |         # if top level is smaller than current header to test, pop it
115 |         # repeat until top level is bigger or same
116 | 
117 |         while self.__top_has_no_header(stack) or self.__should_pop_higher_level(stack, header):
118 |             poped = stack.pop()
119 |             # header on higher level in stack has sime FontSize
120 |             # -> check additional sub-header conditions like regexes, enumeration etc.
121 |             if poped.heading.style.mapped_font_size == headerSize:
122 |                 # check if header_to_check is sub-header of poped element within stack
123 |                 if self._isSubHeader.test(poped, header):
124 |                     stack.append(poped)
125 |                     return
126 | 
127 |     @staticmethod
128 |     def __push_to_stack(child, stack, output):
129 |         """
130 |         insert incoming paragraph (child) in level(hierarchy) stack.
131 |         @param child: next incoming paragraph
132 |         @param stack: hierarchy-detect helper stack
133 |         @param output: exporting list of elements (contains complete structure in the end)
134 |         @return:
135 |         """
136 |         if stack:
137 |             child.set_level(len(stack))
138 |             stack[-1].children.append(child)
139 |         else:
140 |             # append as highest order element
141 |             output.append(child)
142 |         stack.append(child)
143 | 
144 |     @staticmethod
145 |     def __should_pop_higher_level(stack: [Section], header_to_test: Section):
146 |         """
147 |         helper method for __pop_stack_until_match: check if last element in stack is smaller then new header-paragraph.
148 |         @type header_to_test: object
149 | 
150 |         """
151 |         if not stack:
152 |             return False
153 |         return stack[-1].heading.style.mapped_font_size <= header_to_test.heading.style.mapped_font_size
154 | 
155 |     @staticmethod
156 |     def __top_has_no_header(stack: [Section]):
157 |         """
158 |         helper method for @__pop_stack_until_match
159 |         @param stack:
160 |         @return:
161 |         """
162 |         if not stack:
163 |             return False
164 |         return len(stack[-1].heading._data) == 0
165 | 
166 | 
167 | def enrich_metadata(pdf: StructuredPdfDocument, source: Source):
168 |     """
169 |     add some metadata to parsed PDF if possible
170 |         # add filename
171 |         # todo create document summary
172 |         # todo extract document-title from best titles
173 |     @return:
174 |     """
175 |     # try to capture filename
176 |     try:
177 |         filename = Path(source.uri).name
178 |         pdf.update_metadata("filename", filename)
179 |     except Exception as e:
180 |         pass
181 | 


--------------------------------------------------------------------------------
/pdfstructure/hierarchy/traversal.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from collections import deque
 3 | from typing import Generator
 4 | 
 5 | from pdfstructure.model.document import StructuredPdfDocument, Section
 6 | 
 7 | 
 8 | def get_document_depth(document: StructuredPdfDocument):
 9 |     """
10 |     retrieves document depth found within tree structure, + 1 because the levels are 0 notated.
11 |     """
12 |     return max(set([section.level for section in traverse_in_order(document)])) + 1
13 | 
14 | 
15 | def traverse_inorder_sections_with_content(document: StructuredPdfDocument) -> Generator[
16 |     tuple, StructuredPdfDocument, None]:
17 |     """
18 |     Traverse section by section in order and group top children as combined content
19 |     @param document:
20 |     @return: yields level, title, content
21 |     """
22 |     for section in filter(lambda sec: len(sec.children) > 0, traverse_in_order(document)):
23 |         children: Section
24 |         content = []
25 |         for children in section.children:
26 |             if children.children:
27 |                 continue
28 |             content.append(children.heading_text)
29 |         yield section.level, section.heading_text, "\n".join(content)
30 | 
31 | 
32 | def traverse_in_order(document: StructuredPdfDocument) \
33 |         -> Generator[Section, StructuredPdfDocument, None]:
34 |     """
35 |                      5   10
36 |                   /   \    \
37 |                  1     2    3
38 |                / | \        |
39 |               a  b  c       x
40 | 
41 |     yield order:
42 |     - [5,1,a,b,c,2,10,3,x]
43 |     """
44 | 
45 |     def __traverse__(section: Section):
46 |         child: Section
47 |         for child in section.children:
48 |             yield child
49 |             yield from __traverse__(child)
50 | 
51 |     for element in document.elements:
52 |         yield element
53 |         yield from __traverse__(element)
54 | 
55 | 
56 | def traverse_level_order(document: StructuredPdfDocument, max_depth=sys.maxsize) \
57 |         -> Generator[Section, StructuredPdfDocument, None]:
58 |     """
59 |                      5   10
60 |                   /   \    \
61 |                  1     2    3
62 |                / | \        |
63 |               a  b  c       x
64 | 
65 |     yield order:
66 |     - [5,10,1,2,3,a,b,c,x]
67 | 
68 |     @param document: structured pdf document, each element holds its own dopth information (Section.level)
69 |     @param max_depth: yield elements until max_depth is reached
70 |     """
71 | 
72 |     element_queue = deque()
73 | 
74 |     element: Section
75 |     for element in document.elements:
76 |         element_queue.append(element)
77 | 
78 |     while element_queue:
79 |         element = element_queue.popleft()
80 | 
81 |         if element.level < max_depth:
82 |             yield element
83 | 
84 |             # append next layer of elements / children nodes
85 |             for child in element.children:
86 |                 element_queue.append(child)
87 | 


--------------------------------------------------------------------------------
/pdfstructure/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/pdfstructure/model/__init__.py


--------------------------------------------------------------------------------
/pdfstructure/model/document.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict
  2 | from typing import List
  3 | 
  4 | from pdfminer.layout import LTTextContainer
  5 | 
  6 | from pdfstructure.analysis.styledistribution import StyleDistribution
  7 | from pdfstructure.model.style import Style
  8 | 
  9 | 
 10 | class TextElement:
 11 |     """
 12 |     Represents one single TextContainer like a line of words.
 13 |     """
 14 | 
 15 |     def __init__(self, text_container: LTTextContainer, style: Style, text=None, page=None):
 16 |         self._data = text_container
 17 |         self._text = text
 18 |         self.style = style
 19 |         self.page = page
 20 | 
 21 |     @property
 22 |     def text(self):
 23 |         if not self._data:
 24 |             return self._text
 25 |         else:
 26 |             return self._data.get_text().strip()
 27 | 
 28 |     @classmethod
 29 |     def from_json(cls, data: dict):
 30 |         """
 31 | 
 32 |         @param data:
 33 |         @return:
 34 |         """
 35 |         if data:
 36 |             return TextElement(text_container=None, style=Style.from_json(data["style"]),
 37 |                                text=data["text"])
 38 |         return None
 39 | 
 40 |     def __str__(self):
 41 |         return self.text
 42 | 
 43 | 
 44 | class Section:
 45 |     """
 46 |     Represents a section with title, contents and children
 47 |     """
 48 |     heading: TextElement
 49 | 
 50 |     def __init__(self, element: TextElement, level=0):
 51 |         self.heading = element
 52 |         self.children = []  # Section
 53 |         self.level = None
 54 |         self.set_level(level)
 55 | 
 56 |     def set_level(self, level):
 57 |         self.level = level
 58 | 
 59 |     def append_children(self, section):
 60 |         self.children.append(section)
 61 | 
 62 |     @property
 63 |     def full_content(self):
 64 |         """
 65 |         Returns merged full content of all nested children.
 66 |         @return:
 67 |         """
 68 |         contents = [self.heading_text] if self.heading_text else []
 69 | 
 70 |         def __traverse__(section: Section):
 71 |             child: Section
 72 |             for child in section.children:
 73 |                 yield child
 74 |                 yield from __traverse__(child)
 75 | 
 76 |         for child in __traverse__(self):
 77 |             if child.heading_text:
 78 |                 contents.append(child.heading_text)
 79 |         return "\n".join(contents)
 80 | 
 81 |     @property
 82 |     def top_level_content(self):
 83 |         """
 84 |         Paragraphs that belong directly to section, nested children are skipped.
 85 |         Example:
 86 |             This is a Header
 87 |                 paragraph 1
 88 |                 paragraph 2
 89 |                 This is a subheader
 90 |                     paragraph 3
 91 |         Returns:
 92 |             [paragraph 1, paragraph 2]
 93 | 
 94 |         @return: List[Section]
 95 |         """
 96 |         child: Section
 97 |         content = []
 98 |         for child in self.children:
 99 |             if child.children:
100 |                 continue
101 |             content.append(child)
102 |         return content
103 | 
104 |     @classmethod
105 |     def from_json(cls, data: dict):
106 |         children = list(map(Section.from_json, data.get("children")))
107 |         heading = TextElement.from_json(data.get("heading"))
108 |         element = cls(heading, data["level"])
109 |         element.children = children
110 |         return element
111 | 
112 |     @property
113 |     def heading_text(self):
114 |         if self.heading and self.heading.text:
115 |             return self.heading.text
116 |         else:
117 |             return ""
118 | 
119 |     def __str__(self):
120 |         return self.heading_text
121 |         # return "{}\n{}".format(self.heading.text,
122 |         #                       " ".join([str(child.heading.text) for child in self.children]))
123 | 
124 | 
125 | class DanglingTextSection(Section):
126 |     def __init__(self):
127 |         super().__init__(element=None)
128 | 
129 |     def __str__(self):
130 |         return "{}".format(" ".join([str(e) for e in self.content]))
131 | 
132 | 
133 | class StructuredPdfDocument:
134 |     """
135 |     PDF document containing its natural order hierarchy, as detected by the HierarchyParser.
136 |     """
137 |     elements: List[Section]
138 | 
139 |     def __init__(self, elements: [Section], style_info=None):
140 |         self.metadata = defaultdict(str)
141 |         self.elements = elements
142 |         self.metadata["style_distribution"] = style_info
143 | 
144 |     def update_metadata(self, key, value):
145 |         self.metadata[key] = value
146 | 
147 |     @property
148 |     def text(self):
149 |         return "\n".join([item.full_content for item in self.elements])
150 | 
151 |     @property
152 |     def title(self):
153 |         return self.metadata.get("title")
154 | 
155 |     @property
156 |     def style_distribution(self) -> StyleDistribution:
157 |         return self.metadata.get("style_distribution")
158 | 
159 |     @classmethod
160 |     def from_json(cls, data: dict):
161 |         elements = list(map(Section.from_json, data["elements"]))
162 |         pdf = cls(elements)
163 |         pdf.metadata.update(data.get("metadata"))
164 |         return pdf
165 | 


--------------------------------------------------------------------------------
/pdfstructure/model/style.py:
--------------------------------------------------------------------------------
 1 | from enum import IntEnum, auto
 2 | 
 3 | 
 4 | class TextSize(IntEnum):
 5 |     """
 6 |     Used to annotate extracted paragraphs with a predefined range of possible FontSize.
 7 |     Convenient to perform style-based grouping of similar pieces of text.
 8 |     """
 9 |     xsmall = auto()
10 |     small = auto()
11 |     middle = auto()
12 |     large = auto()
13 |     xlarge = auto()
14 | 
15 |     @classmethod
16 |     def from_range(cls, borders: tuple, value: int):
17 |         if value < borders[0]:
18 |             return cls.xsmall
19 |         elif borders[0] <= value < borders[1]:
20 |             return cls.small
21 |         elif borders[1] <= value < borders[2]:
22 |             return cls.middle
23 |         elif borders[2] <= value < borders[3]:
24 |             return cls.large
25 |         elif value >= borders[3]:
26 |             return cls.xlarge
27 | 
28 | 
29 | class Style:
30 |     """
31 |     Extracted paragraphs get annotated with found font-style information.
32 |     """
33 | 
34 |     def __init__(self, bold, italic, font_name, mapped_font_size: TextSize, mean_size: float, max_size: float):
35 |         self.bold = bold
36 |         self.italic = italic
37 |         self.font_name = font_name
38 |         self.mapped_font_size = mapped_font_size
39 |         self.mean_size = mean_size
40 |         self.max_size = max_size
41 | 
42 |     @classmethod
43 |     def from_json(cls, data: dict):
44 |         mapped_size = TextSize[data["mapped_font_size"]]
45 |         data["mapped_font_size"] = mapped_size
46 |         return cls(**data)
47 | 
48 |     def __gt__(self, other):
49 |         if isinstance(other, Style):
50 |             return self.mapped_font_size > other.mapped_font_size or \
51 |                    self.mapped_font_size == other.mapped_font_size and self.bold and not other.bold
52 |         return False
53 | 
54 |     def __lt__(self, other):
55 |         if isinstance(other, Style):
56 |             return self.mapped_font_size < other.mapped_font_size or \
57 |                    self.mapped_font_size == other.mapped_font_size and not self.bold and other.bold
58 |         return False
59 | 
60 |     def __eq__(self, other):
61 |         if isinstance(other, Style):
62 |             return self.mapped_font_size == other.mapped_font_size and \
63 |                    self.bold == other.bold
64 |         else:
65 |             return False
66 | 


--------------------------------------------------------------------------------
/pdfstructure/printer.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | from typing import Iterator
  3 | 
  4 | from pdfstructure.hierarchy.traversal import traverse_in_order
  5 | from pdfstructure.model.document import Section, StructuredPdfDocument, TextElement
  6 | from pdfstructure.model.style import Style
  7 | from pdfstructure.utils import dict_subset
  8 | 
  9 | 
 10 | class Printer:
 11 |     def print(self, document: StructuredPdfDocument, *args, **kwargs):
 12 |         pass
 13 | 
 14 | 
 15 | class PrettyStringPrinter(Printer):
 16 |     """
 17 |     pretty prints nested document structure with \t prefixes per level.
 18 |     """
 19 | 
 20 |     @staticmethod
 21 |     def get_title_prefix(level):
 22 |         return "".join(["\t" for i in range(level)])
 23 | 
 24 |     def make_item_pretty(self, item_gen: Iterator[Section]):
 25 |         """
 26 |         yield pretty string representation of given element
 27 |         - add prefix for each paragraph, corresponding to its level
 28 |         - put Title in brackets
 29 |         @param item_gen: all elements in order, generator provided by StructuredPdfDocument.traverse()
 30 |         """
 31 |         for element in item_gen:
 32 |             prefix = self.get_title_prefix(element.level)
 33 |             formatted_text = element.heading_text.rstrip().replace("\n", "\n{}".format(prefix))
 34 |             # if element has children, then its content is a header
 35 |             if element.children:
 36 |                 content = "\n\n{}[{}]".format(prefix, formatted_text)
 37 |             else:
 38 |                 # render as normal content
 39 |                 content = "\n{}{}".format(prefix, formatted_text)
 40 |             yield content
 41 | 
 42 |     def print(self, document: StructuredPdfDocument, *args, **kwargs):
 43 |         element_iterator = traverse_in_order(document)
 44 |         data = [item for item in self.make_item_pretty(element_iterator)]
 45 |         return "".join(data)
 46 | 
 47 | 
 48 | class PrettyStringFilePrinter(PrettyStringPrinter):
 49 |     def print(self, document: StructuredPdfDocument, *args, **kwargs) -> str:
 50 |         """
 51 |         
 52 |         @param **kwargs:
 53 |         @param document:
 54 |         @param args:
 55 |         @param kwargs:
 56 |             Keyword Args:
 57 |                 file_path (str): path to output file
 58 |         @return: file_path to outputfile
 59 |         """
 60 |         output_file = kwargs.get("file_path")
 61 |         print("write to file: {}".format(output_file))
 62 |         with open(output_file, "w") as file:
 63 |             element_iterator = traverse_in_order(document)
 64 |             for pretty in self.make_item_pretty(element_iterator):
 65 |                 file.write(pretty)
 66 |         return output_file
 67 | 
 68 | 
 69 | class ElementTextEncoder(json.JSONEncoder):
 70 |     def default(self, e):
 71 |         if isinstance(e, TextElement):
 72 |             properties = e.__dict__.copy()
 73 |             properties["data"] = e._data.get_text()
 74 |             return properties
 75 |         else:
 76 |             return super().default(e)
 77 | 
 78 | 
 79 | def encode_pdf_element(obj):
 80 |     """
 81 |     customizse pdf element encoding
 82 |     - get rid of detailed pdf information retrieved from pdfminer like bounding box coords
 83 |     - use mapped fontsize name instead of ordinal value
 84 |     @param obj:
 85 |     @return:
 86 |     """
 87 |     if isinstance(obj, TextElement):
 88 |         properties = dict_subset(obj.__dict__.copy(), ("_data", "_text"))
 89 |         properties["text"] = obj.text
 90 |         properties["style"] = encode_pdf_element(obj.style)
 91 |         return properties
 92 |     elif isinstance(obj, Style):
 93 |         properties = obj.__dict__.copy()
 94 |         properties["mapped_font_size"] = str(obj.mapped_font_size.name)
 95 |         return properties
 96 |     else:
 97 |         return obj.__dict__
 98 | 
 99 | 
100 | class JsonStringPrinter(Printer):
101 |     def print(self, document: StructuredPdfDocument, *args, **kwargs):
102 |         return json.dumps(document, default=encode_pdf_element, indent=4)
103 | 
104 | 
105 | class JsonFilePrinter(Printer):
106 |     def print(self, document: StructuredPdfDocument, *args, **kwargs):
107 |         """
108 |         @param document:
109 |         @param args:
110 |         @param kwargs:
111 |             Keyword Args:
112 |                 file_path (str): path to output file
113 |         @return:
114 |         """
115 |         file_path = kwargs.get("file_path")
116 |         with open(file_path, "w") as fp:
117 |             json.dump(document, fp=fp, default=encode_pdf_element, indent=4)
118 |         return file_path
119 | 


--------------------------------------------------------------------------------
/pdfstructure/source.py:
--------------------------------------------------------------------------------
  1 | import itertools
  2 | from typing import Generator, Any
  3 | 
  4 | from pdfminer.high_level import extract_pages
  5 | from pdfminer.layout import LTTextContainer, LAParams, LTFigure, LTTextBoxHorizontal, LTTextLineHorizontal, LTChar, \
  6 |     LTTextBoxVertical
  7 | 
  8 | 
  9 | class Source:
 10 |     """
 11 |     Abstract interface to read a PDF from somewhere.
 12 |     """
 13 | 
 14 |     def __init__(self, uri=None):
 15 |         """
 16 | 
 17 |         @param uri: points to pdf that should be read
 18 |         """
 19 |         self.uri = uri
 20 | 
 21 |     def config(self):
 22 |         """
 23 |         get source configuration
 24 |         @return:
 25 |         """
 26 |         pass
 27 | 
 28 |     def read(self, *args, **kwargs) -> Generator[LTTextContainer, Any, None]:
 29 |         """
 30 |         yields flat list of paragraphs within a document.
 31 |         @param args:
 32 |         @param kwargs:
 33 |         @return:
 34 |         """
 35 |         pass
 36 | 
 37 | 
 38 | class FileSource(Source):
 39 |     def __init__(self, file_path: str, page_numbers=None,
 40 |                  la_params=LAParams(boxes_flow=0.3, detect_vertical=True, line_margin=0.3)):
 41 |         super().__init__(uri=file_path)
 42 |         self.page_numbers = page_numbers
 43 |         self.la_params = la_params
 44 | 
 45 |     def config(self):
 46 |         return self.__dict__
 47 | 
 48 |     def __handle_lt_figure(self, element: LTFigure):
 49 |         """
 50 |         sometimes pieces of text are wrongly detected as LTFigure, e.g. in slide-sets with border lines.
 51 |         -> extract text from LTFigure line by line put them into a LTTextBoxHorizontal as a workaround
 52 |         @return: LTTextBoxHorizontal containing found texts line by line
 53 |         """
 54 |         # check if text is hold within figure element, forward
 55 | 
 56 |         line = LTTextLineHorizontal(0)
 57 |         wrapper = LTTextBoxHorizontal()
 58 |         wrapper.add(line)
 59 | 
 60 |         y_prior = element._objs[0].y0
 61 | 
 62 |         for letter in element:
 63 |             if isinstance(letter, LTChar):
 64 |                 if abs(letter.y0 - y_prior) > 0.05:
 65 |                     # new line, yield wrapper
 66 |                     wrapper.analyze(self.la_params)
 67 |                     yield wrapper
 68 | 
 69 |                     wrapper = LTTextBoxHorizontal()
 70 |                     line = LTTextLineHorizontal(0)
 71 |                     wrapper.add(line)
 72 |                     y_prior = letter.y0
 73 | 
 74 |                 line.add(letter)
 75 | 
 76 |     def split_boxes_by_style(self, container: LTTextContainer) -> Generator[LTTextContainer, LTTextContainer, None]:
 77 |         """
 78 |         pdfminers paragraphs are sometimes too broad and contain lines that should be splitted into header and content
 79 |         @param container: the extracted original paragraph
 80 |         """
 81 |         if isinstance(container, LTTextBoxVertical):
 82 |             yield container
 83 |             return
 84 | 
 85 |         line: LTTextLineHorizontal
 86 |         wrapper = LTTextBoxHorizontal()
 87 |         wrapper.page = container.page
 88 |         stack = []
 89 |         for line in container:
 90 |             size = max([obj.size for obj in itertools.islice(line, 10) if isinstance(obj, LTChar)])
 91 |             if not stack:
 92 |                 wrapper.add(line)
 93 |                 stack.append(size)
 94 |             else:
 95 |                 prior = stack.pop()
 96 |                 stack.append(size)
 97 |                 diff = abs(prior - size)
 98 |                 if diff != 0 and max(prior, size) / min(prior, size) > 1.15:
 99 |                     # break paragraph
100 |                     yield wrapper
101 |                     wrapper = LTTextBoxHorizontal()
102 |                 wrapper.add(line)
103 |         yield wrapper
104 | 
105 |     def read(self, override_la_params=None, override_page_numbers=None) -> Generator[LTTextContainer, Any, None]:
106 |         pNumber = 0
107 |         # disable boxes_flow, style based hierarchy detection is based on purely flat list of paragraphs
108 |         # params = LAParams(boxes_flow=None, detect_vertical=False)  # setting for easy doc
109 |         # params = LAParams(boxes_flow=0.5, detect_vertical=True) # setting for column doc
110 |         if override_la_params:
111 |             # use dynamic line_margin
112 |             self.la_params.line_margin = override_la_params.line_margin
113 |         # todo, do pre-analysis in count_sizes --> are there many boxes within same line
114 |         # todo, understand LAParams, for columns, NONE works better, for vertical only layout LAParams(boxes_flow=None, detect_vertical=False) works better!! :O
115 |         #   do some sort of layout analyis, if there are many boxes vertically next to each other, use layout analysis
116 |         #   - column type
117 |         #   - straight forward document
118 |         for page_layout in extract_pages(self.uri,
119 |                                          laparams=self.la_params,
120 |                                          page_numbers=self.page_numbers if not override_page_numbers else override_page_numbers):
121 |             for element in page_layout:
122 |                 element.page = pNumber
123 |                 if isinstance(element, LTTextContainer):
124 |                     yield from self.split_boxes_by_style(element)
125 |                     #yield element
126 |                 elif isinstance(element, LTFigure):
127 |                     yield from self.__handle_lt_figure(element)
128 |             pNumber += 1
129 | 


--------------------------------------------------------------------------------
/pdfstructure/utils.py:
--------------------------------------------------------------------------------
  1 | import itertools
  2 | import math
  3 | import os
  4 | from pathlib import Path
  5 | from typing import Generator
  6 | 
  7 | from pdfminer.high_level import extract_pages
  8 | from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LAParams, LTTextLineHorizontal
  9 | 
 10 | 
 11 | def char_generator(text_container: LTTextContainer):
 12 |     for container in text_container:
 13 |         if isinstance(container, LTChar):
 14 |             yield container
 15 |         elif isinstance(container, LTTextLine):
 16 |             for obj in container:
 17 |                 if isinstance(obj, LTChar):
 18 |                     yield obj
 19 | 
 20 | 
 21 | def dict_subset(d, exclude_keys):
 22 |     return {k: v for k, v in d.items() if k not in exclude_keys}
 23 | 
 24 | 
 25 | def word_generator(text_container: LTTextContainer):
 26 |     """
 27 |     iterates through container's characters and yields a word as soon as found (trailing whitespace).
 28 |     @param text_container:
 29 |     @return:
 30 |     """
 31 |     characters = []
 32 | 
 33 |     for obj in char_generator(text_container):
 34 |         character = obj.get_text()
 35 |         if character != " ":
 36 |             characters.append(character)
 37 |         else:
 38 |             word = "".join(characters).strip()
 39 |             # skip if word is just a whitespace on its own
 40 |             if len(word) > 0:
 41 |                 yield word
 42 |             characters.clear()
 43 |     if characters:
 44 |         yield "".join(characters)
 45 | 
 46 | 
 47 | def element_generator(file_path: str, page_numbers=None) -> Generator[LTTextContainer, None, None]:
 48 |     """
 49 |     yields flat list of paragraphs within a document.
 50 |     :param file_path:
 51 |     :return:
 52 |     """
 53 |     pNumber = 0
 54 |     # disable boxes_flow, style based hierarchy detection is based on purely flat list of paragraphs
 55 |     params = LAParams(boxes_flow=None, detect_vertical=False)  # setting for easy doc
 56 |     # params = LAParams(boxes_flow=0.5, detect_vertical=True) # setting for column doc
 57 |     # todo, do pre-analysis in count_sizes --> are there many boxes within same line
 58 |     # todo, understand LAParams, for columns, NONE works better, for vertical only layout LAParams(boxes_flow=None, detect_vertical=False) works better!! :O
 59 |     #   do some sort of layout analyis, if there are many boxes vertically next to each other, use layout analysis
 60 |     #   - column type
 61 |     #   - straight forward document
 62 |     for page_layout in extract_pages(file_path, laparams=params, page_numbers=page_numbers):
 63 |         for element in page_layout:
 64 |             if isinstance(element, LTTextContainer):
 65 |                 element.meta = {"page": pNumber}
 66 |                 yield element
 67 |         pNumber += 1
 68 | 
 69 | 
 70 | def truncate(number, decimals=0):
 71 |     """
 72 |     Returns a value truncated to a specific number of decimal places.
 73 |     """
 74 |     if not isinstance(decimals, int):
 75 |         raise TypeError("decimal places must be an integer.")
 76 |     elif decimals < 0:
 77 |         raise ValueError("decimal places has to be 0 or more.")
 78 |     elif decimals == 0:
 79 |         return math.trunc(number)
 80 | 
 81 |     factor = 10.0 ** decimals
 82 |     return math.trunc(number * factor) / factor
 83 | 
 84 | 
 85 | class DocTypeFilter:
 86 | 
 87 |     def __init__(self, endings=("doc", "docx", "ppt", "pptx", "xls", "xlsx", "odt", "rtf")):
 88 |         self.endings = endings if isinstance(endings, (list, tuple)) else (endings)
 89 | 
 90 |     def test(self, name):
 91 |         return name.split(".")[-1].lower() in self.endings
 92 | 
 93 | 
 94 | def closest_key(sorted_dict, key):
 95 |     "Return closest key in `sorted_dict` to given `key`."
 96 |     assert len(sorted_dict) > 0
 97 |     keys = list(itertools.islice(sorted_dict.irange(minimum=key), 1))
 98 |     keys.extend(itertools.islice(sorted_dict.irange(maximum=key, reverse=True), 1))
 99 |     return min(keys, key=lambda k: abs(key - k))
100 | 
101 | 
102 | def find_file(root_dir: str, type_filter: DocTypeFilter, print_mod=10) -> iter([Path]):
103 |     processed = 0
104 |     for root, dirs, files in os.walk(root_dir):
105 |         for file in files:
106 |             if type_filter.test(file):
107 |                 yield Path(root + "/" + file)
108 |                 processed += 1
109 |                 if print_mod and processed % print_mod == 0:
110 |                     print("\nprocessed {}\n".format(processed))
111 |     print("found {} file-paths".format(processed))
112 | 
113 | 
114 | def head_char_line(container: LTTextLineHorizontal) -> LTChar:
115 |     """
116 |     :rtype LTChar
117 |     :param container:
118 |     :return:
119 |     """
120 |     for obj in container:
121 |         if isinstance(obj, LTChar):
122 |             return obj
123 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
  1 | # PDF Structural Parser
  2 | `pdfstructure` detects, splits and organises the documents text content into its natural structure as envisioned by the author.
  3 | The document structure, or hierarchy, stores the relation between chapters, sections and their sub sections in a nested, recursive manner.   
  4 | 
  5 | 
  6 | `pdfstructure` is in early development and built on top of [pdfminer.six](https://github.com/pdfminer/pdfminer.six). 
  7 | 
  8 | - Paragraph extraction is performed leveraging `pdfminer.high_level.extract_pages()`.
  9 | - Those paragraphs are then grouped together according to some basic (extendable) heuristics.
 10 | 
 11 | ## Document Model
 12 | 
 13 | ```
 14 | class StructuredDocument:
 15 |   metadata: dict
 16 |   sections: List[Section]
 17 | 
 18 | class Section:
 19 |   content:  TextElement
 20 |   children: List[Section]
 21 |   level:    int
 22 | 
 23 | class TextElement:
 24 |   text:     LTTextContainer # the extracted paragraph from pdfminer
 25 |   style:    Style
 26 | ```
 27 | 
 28 | ## Load and parse PDF
 29 | 
 30 | **Illustration of document structure**
 31 | 
 32 | The following screenshot contains sections and subsections with their respective content. 
 33 | In that case, the structure can be easily parsed by leveraging the Font Style only.
 34 | 
 35 | ![Example PDF](tests/resources/interview_cheatsheet-excerpt.png?raw=true)
 36 | *PDF source: [github.com/TSiege](https://gist.github.com/TSiege/cbb0507082bb18ff7e4b)*
 37 | 
 38 | 
 39 | **Parse PDF**
 40 | ```
 41 |     from pdfstructure.hierarchy.parser import HierarchyParser
 42 |     from pdfstructure.source import FileSource
 43 | 
 44 |     parser = HierarchyParser() 
 45 |     
 46 |     # specify source (that implements source.read())
 47 |     source = FileSource(path) 
 48 |      
 49 |     # analyse document and parse as nested data structure
 50 |     document = parser.parse_pdf(source)
 51 | ```
 52 | 
 53 | ### Serialize Document to String
 54 | To export the parsed structure, use a printer implementation.
 55 | ```
 56 |     from pdfstructure.printer import PrettyStringPrinter
 57 | 
 58 |     stringExporter = PrettyStringPrinter()
 59 |     prettyString = stringExporter.print(document)
 60 | ```
 61 | 
 62 | **Excerpt of the parsed document (serialized to string)**
 63 | 
 64 | [Parsed data: interview_cheatsheet_pretty.txt](tests/resources/parsed/interview_cheatsheet_pretty.txt?raw=true)
 65 | ```
 66 | [Search Basics]
 67 | 	[Breadth First Search]
 68 | 		[Definition:]
 69 | 			An algorithm that searches a tree (or graph) by searching levels of the tree first, starting at the root.
 70 | 			It finds every node on the same level, most often moving left to right.
 71 | 			While doing this it tracks the children nodes of the nodes on the current level.
 72 | 			When finished examining a level it moves to the left most node on the next level.
 73 | 			The bottom-right most node is evaluated last (the node that is deepest and is farthest right of it's level).
 74 | 
 75 | 		[What you need to know:]
 76 | 			Optimal for searching a tree that is wider than it is deep.
 77 | 			Uses a queue to store information about the tree while it traverses a tree.
 78 | 			Because it uses a queue it is more memory intensive than depth first search.
 79 | 			The queue uses more memory because it needs to stores pointers
 80 | ```
 81 | 
 82 | ### Encode Document to JSON
 83 | ```
 84 |     from pdfstructure.printer import JsonFilePrinter
 85 |     
 86 |     printer = JsonFilePrinter()
 87 |     file_path = Path("resources/parsed/interview_cheatsheet.json")
 88 |     
 89 |     printer.print(document, file_path=str(file_path.absolute()))
 90 | ```
 91 | 
 92 | [Parsed data: interview_cheatsheet.json](tests/resources/parsed/interview_cheatsheet.json?raw=true)
 93 | 
 94 | **Excerpt of exported json**
 95 | ```
 96 | {
 97 |     "metadata": {
 98 |         "style_info": {
 99 |             "_data": {
100 |                 "7.99": 24,
101 |                 "9.6": 1,
102 |                 "6.4": 7,
103 |                 "7.47": 3,
104 |                 "12.8": 7,
105 |                 "8.53": 206,
106 |                 "10.67": 12,
107 |                 "7.25": 14
108 |             },
109 |             "_body_size": 8.53,
110 |             "_min_found_size": 6.4,
111 |             "_max_found_size": 12.8
112 |         },
113 |         "filename": "interview_cheatsheet.pdf"
114 |     },
115 |     "elements": [
116 |      {
117 |             "content": {
118 |                 "style": {
119 |                     "bold": true,
120 |                     "italic": false,
121 |                     "font_name": ".SFNSDisplay-Semibold",
122 |                     "mapped_font_size": "xlarge",
123 |                     "mean_size": 12.8,
124 |                     "max_size": 12.806323818403143
125 |                 },
126 |                 "page": 0,
127 |                 "text": "Data Structure Basics"
128 |             },
129 |             "children": [
130 |                 {
131 |                     "content": {
132 |                         "style": {
133 |                             "bold": true,
134 |                             "italic": false,
135 |                             "font_name": ".SFNSDisplay-Semibold",
136 |                             "mapped_font_size": "large",
137 |                             "mean_size": 10.6,
138 |                             "max_size": 10.671936515335972
139 |                         },
140 |                         "page": 0,
141 |                         "text": "Array"
142 |                     },
143 |                     "children": [
144 |                         {
145 |                             "content": {
146 |                                 "style": {
147 |                                     "bold": true,
148 |                                     "italic": false,
149 |                                     "font_name": ".SFNSText-Semibold",
150 |                                     "mapped_font_size": "middle",
151 |                                     "mean_size": 8.5,
152 |                                     "max_size": 8.537549212268772
153 |                                 },
154 |                                 "page": 0,
155 |                                 "text": "Definition:"
156 |                             },
157 |          ....          
158 | ```
159 | 
160 | 
161 | ### Load JSON as StructuredPdfDocument
162 | 
163 | Of course, encoded documents can be easily decoded and used for further analysis. 
164 | However, detailed information like bounding boxes or coordinates for each character are not persisted.
165 | 
166 | ```
167 |     from pdfstructure.model.document import StructuredPdfDocument
168 | 
169 |     jsonString = json.load(file)
170 |     document = StructuredPdfDocument.from_json(jsonString)
171 |     
172 |     print(document.title)
173 | ``
174 |         $ "interview_cheatsheet.pdf"
175 | ```
176 | 
177 | ## Traverse through document structure
178 | Having all paragraphs and sections organised as a general tree, 
179 | its straight forward to iterate through the layers and search for specific elements like headlines, or extract all main headers like chapter titles.  
180 | 
181 | Two document traversal generators are available that yield each section `in-order` or in `level-order` respectively. 
182 | ```
183 |     from pdfstructure.hierarchy.traversal import traverse_in_order
184 | 
185 |     elements_flat_in_order = [element for element in traverse_in_order(document)]
186 | 
187 |     Exemplary illustration of yield order:
188 |         """
189 |                          5   10
190 |                       /   \    \
191 |                      1     2    3
192 |                    / | \        |
193 |                   a  b  c       x
194 |     
195 |         yield order:
196 |         - [5,1,a,b,c,2,10,3,x]
197 |         """
198 | ```
199 | 
200 | 
201 | # TODOs
202 | - [ ] **Detect the document layout type (Columns, Book, Magazine)**
203 |     
204 |     The provided layout analysis algorithm by pdfminer.six performs well on more straightforward documents with default settings. 
205 |     However, more complicated layouts like scientific papers need custom `LAParams` settings to retrieve paragraphs in correct reading order.
206 | - [ ] High level diagram of algorithm workflow
207 | - [ ] Performance improvement in terms of speed


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | sortedcontainers==2.2.2
2 | pdfminer.six==20200517


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | setup(
 4 |     name='pdfstructure',
 5 |     description="PDF Natural Structure Parser",
 6 |     version='0.0.1',
 7 |     author="Christian Hofer",
 8 |     author_email="christianhofer91@gmail.com",
 9 |     packages=find_packages(exclude=("tests"))
10 | )
11 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/__init__.py


--------------------------------------------------------------------------------
/tests/helper.py:
--------------------------------------------------------------------------------
 1 | from pdfstructure.analysis.annotate import StyleAnnotator
 2 | from pdfstructure.analysis.sizemapper import PivotLogMapper
 3 | from pdfstructure.analysis.styledistribution import count_sizes
 4 | from pdfstructure.utils import element_generator
 5 | 
 6 | 
 7 | def generate_annotated_lines(file_path):
 8 |     """
 9 |     yields paragraph detected by pdfminer annotated with detected & mapped style information
10 |     """
11 |     element_gen = element_generator(file_path)
12 |     distribution = count_sizes(element_gen)
13 |     sizeMapper = PivotLogMapper(distribution)
14 |     style_annotator = StyleAnnotator(sizemapper=sizeMapper, style_info=distribution)
15 | 
16 |     elements = element_generator(file_path)
17 |     with_style = style_annotator.process(elements)
18 | 
19 |     yield from with_style
20 | 


--------------------------------------------------------------------------------
/tests/resources/5648.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/resources/5648.pdf


--------------------------------------------------------------------------------
/tests/resources/IE00BM67HT60-ATB-FS-DE-2020-2-28.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/resources/IE00BM67HT60-ATB-FS-DE-2020-2-28.pdf


--------------------------------------------------------------------------------
/tests/resources/SameSize_BoldTitle.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/resources/SameSize_BoldTitle.pdf


--------------------------------------------------------------------------------
/tests/resources/SameSize_EnumeratedTitle.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/resources/SameSize_EnumeratedTitle.pdf


--------------------------------------------------------------------------------
/tests/resources/SameStyleOnly.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/resources/SameStyleOnly.pdf


--------------------------------------------------------------------------------
/tests/resources/interview_cheatsheet-excerpt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/resources/interview_cheatsheet-excerpt.png


--------------------------------------------------------------------------------
/tests/resources/interview_cheatsheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/resources/interview_cheatsheet.pdf


--------------------------------------------------------------------------------
/tests/resources/lorem.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/resources/lorem.pdf


--------------------------------------------------------------------------------
/tests/resources/paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/resources/paper.pdf


--------------------------------------------------------------------------------
/tests/resources/parsed/interview_cheatsheet_pretty.txt:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | []
  4 | 31/10/2019
  5 | This is my technical interview cheat sheet. Feel free to fork it or do whatever you want with it. PLEASE let me know if there are any errors or if a…
  6 | 
  7 | [TSiege / The Technical Interview Cheat Sheet.md]
  8 | 	Last active 2 days ago • Report abuse
  9 | 	Embed
 10 | 	Download ZIP
 11 | 	<script src="https://gis
 12 | 	This is my technical interview cheat sheet. Feel free to fork it or do whatever you want with it. PLEASE let me know if there are any errors or if anything
 13 | 	crucial is missing. I will add more links soon.
 14 | 	The Technical Interview Cheat Sheet.md
 15 | 
 16 | [ANNOUNCEMENT]
 17 | 	I have moved this over to the Tech Interview Cheat Sheet Repo since a gist is too difficult to maintain as an open source
 18 | 	endevaor and there is no way to version it. I have updated below, but I will not be able to keep this one up to date so
 19 | 	please checkout the repo instead. The below is just for some preservation for those who stumble across here.
 20 | 	\
 21 | 
 22 | [Studying for a Tech Interview Sucks, so Here's a Cheat Sheet to Help]
 23 | 	This list is meant to be both a quick guide and reference for further research into these topics. It's basically a summary of
 24 | 	that comp sci course you never took or forgot about, so there's no way it can cover everything in depth.
 25 | 
 26 | [Contributing]
 27 | 	Please see the Tech Interview Cheat Sheet Repo
 28 | 
 29 | [Data Structure Basics]
 30 | 
 31 | 	[Array]
 32 | 
 33 | 		[Definition:]
 34 | 			Stores data elements based on an sequential, most commonly 0 based, index.
 35 | 			Based on tuples from set theory.
 36 | 			They are one of the oldest, most commonly used data structures.
 37 | 
 38 | 		[What you need to know:]
 39 | 			Optimal for indexing; bad at searching, inserting, and deleting (except at the end).
 40 | 			Linear arrays, or one dimensional arrays, are the most basic.
 41 | 			Are static in size, meaning that they are declared with a fixed size.
 42 | 			Dynamic arrays are like one dimensional arrays, but have reserved space for additional elements.
 43 | 			If a dynamic array is full, it copies its contents to a larger array.
 44 | 			Multi dimensional arrays nested arrays that allow for multiple dimensions such as an array of arrays providing a 2
 45 | 			dimensional spacial representation via x, y coordinates.
 46 | 			https://gist.github.com/TSiege/cbb0507082bb18ff7e4b#ﬁle-the-technical-interview-cheat-sheet-md
 47 | 			1/18
 48 | 			31/10/2019
 49 | 			This is my technical interview cheat sheet. Feel free to fork it or do whatever you want with it. PLEASE let me know if there are any errors or if a…
 50 | 
 51 | 		[Time Complexity:]
 52 | 			Indexing: Linear array: O(1), Dynamic array: O(1)
 53 | 			Search: Linear array: O(n), Dynamic array: O(n)
 54 | 			Optimized Search: Linear array: O(log n), Dynamic array: O(log n)
 55 | 			Insertion: Linear array: n/a Dynamic array: O(n)
 56 | 
 57 | 	[Linked List]
 58 | 
 59 | 		[Definition:]
 60 | 			Stores data with nodes that point to other nodes.
 61 | 			Nodes, at its most basic it has one datum and one reference (another node).
 62 | 			A linked list chains nodes together by pointing one node's reference towards another node.
 63 | 
 64 | 		[What you need to know:]
 65 | 			Designed to optimize insertion and deletion, slow at indexing and searching.
 66 | 			Doubly linked list has nodes that also reference the previous node.
 67 | 			Circularly linked list is simple linked list whose tail, the last node, references the head, the first node.
 68 | 			Stack, commonly implemented with linked lists but can be made from arrays too.
 69 | 			Stacks are last in, first out (LIFO) data structures.
 70 | 			Made with a linked list by having the head be the only place for insertion and removal.
 71 | 			Queues, too can be implemented with a linked list or an array.
 72 | 			Queues are a first in, first out (FIFO) data structure.
 73 | 			Made with a doubly linked list that only removes from head and adds to tail.
 74 | 
 75 | 		[Time Complexity:]
 76 | 			Indexing: Linked Lists: O(n)
 77 | 			Search: Linked Lists: O(n)
 78 | 			Optimized Search: Linked Lists: O(n)
 79 | 			Insertion: Linked Lists: O(1)
 80 | 
 81 | 	[Hash Table or Hash Map]
 82 | 
 83 | 		[Definition:]
 84 | 			Stores data with key value pairs.
 85 | 			Hash functions accept a key and return an output unique only to that specific key.
 86 | 			This is known as hashing, which is the concept that an input and an output have a one-to-one correspondence
 87 | 			to map information.
 88 | 			Hash functions return a unique address in memory for that data.
 89 | 
 90 | 		[What you need to know:]
 91 | 			Designed to optimize searching, insertion, and deletion.
 92 | 			Hash collisions are when a hash function returns the same output for two distinct inputs.
 93 | 			All hash functions have this problem.
 94 | 			This is often accommodated for by having the hash tables be very large.
 95 | 			Hashes are important for associative arrays and database indexing.
 96 | 
 97 | 		[Time Complexity:]
 98 | 			Indexing: Hash Tables: O(1)
 99 | 			Search: Hash Tables: O(1)
100 | 			Insertion: Hash Tables: O(1)
101 | 
102 | 	[Binary Tree]
103 | 
104 | 		[Definition:]
105 | 			https://gist.github.com/TSiege/cbb0507082bb18ff7e4b#ﬁle-the-technical-interview-cheat-sheet-md
106 | 			2/18
107 | 			31/10/2019
108 | 			This is my technical interview cheat sheet. Feel free to fork it or do whatever you want with it. PLEASE let me know if there are any errors or if a…
109 | 			Is a tree like data structure where every node has at most two children.
110 | 			There is one left and right child node.
111 | 
112 | 		[What you need to know:]
113 | 			Designed to optimize searching and sorting.
114 | 			A degenerate tree is an unbalanced tree, which if entirely one-sided is a essentially a linked list.
115 | 			They are comparably simple to implement than other data structures.
116 | 
117 | 		[Used to make binary search trees.]
118 | 			A binary tree that uses comparable keys to assign which direction a child is.
119 | 			Left child has a key smaller than it's parent node.
120 | 			Right child has a key greater than it's parent node.
121 | 			There can be no duplicate node.
122 | 			Because of the above it is more likely to be used as a data structure than a binary tree.
123 | 
124 | 		[Time Complexity:]
125 | 			Indexing: Binary Search Tree: O(log n)
126 | 			Search: Binary Search Tree: O(log n)
127 | 			Insertion: Binary Search Tree: O(log n)
128 | 
129 | [Search Basics]
130 | 
131 | 	[Breadth First Search]
132 | 
133 | 		[Definition:]
134 | 			An algorithm that searches a tree (or graph) by searching levels of the tree first, starting at the root.
135 | 			It finds every node on the same level, most often moving left to right.
136 | 			While doing this it tracks the children nodes of the nodes on the current level.
137 | 			When finished examining a level it moves to the left most node on the next level.
138 | 			The bottom-right most node is evaluated last (the node that is deepest and is farthest right of it's level).
139 | 
140 | 		[What you need to know:]
141 | 			Optimal for searching a tree that is wider than it is deep.
142 | 			Uses a queue to store information about the tree while it traverses a tree.
143 | 			Because it uses a queue it is more memory intensive than depth first search.
144 | 			The queue uses more memory because it needs to stores pointers
145 | 
146 | 		[Time Complexity:]
147 | 			Search: Breadth First Search: O(V + E)
148 | 			E is number of edges
149 | 			V is number of vertices
150 | 
151 | 	[Depth First Search]
152 | 
153 | 		[Definition:]
154 | 			An algorithm that searches a tree (or graph) by searching depth of the tree first, starting at the root.
155 | 			It traverses left down a tree until it cannot go further.
156 | 			Once it reaches the end of a branch it traverses back up trying the right child of nodes on that branch, and if
157 | 			possible left from the right children.
158 | 			When finished examining a branch it moves to the node right of the root then tries to go left on all it's children
159 | 			until it reaches the bottom.
160 | 			The right most node is evaluated last (the node that is right of all it's ancestors).
161 | 
162 | 		[What you need to know:]
163 | 			https://gist.github.com/TSiege/cbb0507082bb18ff7e4b#ﬁle-the-technical-interview-cheat-sheet-md
164 | 			3/18
165 | 			31/10/2019
166 | 			This is my technical interview cheat sheet. Feel free to fork it or do whatever you want with it. PLEASE let me know if there are any errors or if a…
167 | 			Optimal for searching a tree that is deeper than it is wide.
168 | 			Uses a stack to push nodes onto.
169 | 			Because a stack is LIFO it does not need to keep track of the nodes pointers and is therefore less memory
170 | 			intensive than breadth first search.
171 | 			Once it cannot go further left it begins evaluating the stack.
172 | 
173 | 		[Time Complexity:]
174 | 			Search: Depth First Search: O(|E| + |V|)
175 | 			E is number of edges
176 | 			V is number of vertices
177 | 
178 | 		[Breadth First Search Vs. Depth First Search]
179 | 			The simple answer to this question is that it depends on the size and shape of the tree.
180 | 			For wide, shallow trees use Breadth First Search
181 | 			For deep, narrow trees use Depth First Search
182 | 
183 | 		[Nuances:]
184 | 			Because BFS uses queues to store information about the nodes and its children, it could use more memory than is
185 | 			available on your computer. (But you probably won't have to worry about this.)
186 | 			If using a DFS on a tree that is very deep you might go unnecessarily deep in the search. See xkcd for more
187 | 			information.
188 | 			Breadth First Search tends to be a looping algorithm.
189 | 			Depth First Search tends to be a recursive algorithm.
190 | 
191 | [Efficient Sorting Basics]
192 | 
193 | 	[Merge Sort]
194 | 
195 | 		[Definition:]
196 | 			A comparison based sorting algorithm
197 | 			Divides entire dataset into groups of at most two.
198 | 			Compares each number one at a time, moving the smallest number to left of the pair.
199 | 			Once all pairs sorted it then compares left most elements of the two leftmost pairs creating a sorted group of
200 | 			four with the smallest numbers on the left and the largest ones on the right.
201 | 			This process is repeated until there is only one set.
202 | 
203 | 		[What you need to know:]
204 | 			This is one of the most basic sorting algorithms.
205 | 			Know that it divides all the data into as small possible sets then compares them.
206 | 
207 | 		[Time Complexity:]
208 | 			Best Case Sort: Merge Sort: O(n)
209 | 			Average Case Sort: Merge Sort: O(n log n)
210 | 			Worst Case Sort: Merge Sort: O(n log n)
211 | 
212 | 	[Quicksort]
213 | 
214 | 		[Definition:]
215 | 			A comparison based sorting algorithm
216 | 			Divides entire dataset in half by selecting the middle element and putting all smaller elements to the left of the
217 | 			element and larger ones to the right.
218 | 			It repeats this process on the left side until it is comparing only two elements at which point the left side is
219 | 			sorted.
220 | 			When the left side is finished sorting it performs the same operation on the right side.
221 | 			https://gist.github.com/TSiege/cbb0507082bb18ff7e4b#ﬁle-the-technical-interview-cheat-sheet-md
222 | 			4/18
223 | 			31/10/2019
224 | 			This is my technical interview cheat sheet. Feel free to fork it or do whatever you want with it. PLEASE let me know if there are any errors or if a…
225 | 			Computer architecture favors the quicksort process.
226 | 
227 | 		[What you need to know:]
228 | 			While it has the same Big O as (or worse in some cases) many other sorting algorithms it is often faster in practice
229 | 			than many other sorting algorithms, such as merge sort.
230 | 			Know that it halves the data set by the average continuously until all the information is sorted.
231 | 
232 | 		[Time Complexity:]
233 | 			Best Case Sort: Merge Sort: O(n)
234 | 			Average Case Sort: Merge Sort: O(n log n)
235 | 			Worst Case Sort: Merge Sort: O(n^2)
236 | 
237 | 	[Bubble Sort]
238 | 
239 | 		[Definition:]
240 | 			A comparison based sorting algorithm
241 | 			It iterates left to right comparing every couplet, moving the smaller element to the left.
242 | 			It repeats this process until it no longer moves an element to the left.
243 | 
244 | 		[What you need to know:]
245 | 			While it is very simple to implement, it is the least efficient of these three sorting methods.
246 | 			Know that it moves one space to the right comparing two elements at a time and moving the smaller on to left.
247 | 
248 | 		[Time Complexity:]
249 | 			Best Case Sort: Merge Sort: O(n)
250 | 			Average Case Sort: Merge Sort: O(n^2)
251 | 			Worst Case Sort: Merge Sort: O(n^2)
252 | 
253 | 		[Merge Sort Vs. Quicksort]
254 | 			Quicksort is likely faster in practice.
255 | 			Merge Sort divides the set into the smallest possible groups immediately then reconstructs the incrementally as it
256 | 			sorts the groupings.
257 | 			Quicksort continually divides the set by the average, until the set is recursively sorted.
258 | 
259 | [Basic Types of Algorithms]
260 | 
261 | 	[Recursive Algorithms]
262 | 
263 | 		[Definition:]
264 | 			An algorithm that calls itself in its definition.
265 | 			Recursive case a conditional statement that is used to trigger the recursion.
266 | 			Base case a conditional statement that is used to break the recursion.
267 | 		What you need to know:
268 | 
269 | 		[Stack level too deep and stack overflow.]
270 | 			If you've seen either of these from a recursive algorithm, you messed up.
271 | 			It means that your base case was never triggered because it was faulty or the problem was so massive you ran
272 | 			out of alloted memory.
273 | 			Knowing whether or not you will reach a base case is integral to correctly using recursion.
274 | 			Often used in Depth First Search
275 | 
276 | 	[Iterative Algorithms]
277 | 
278 | 		[Definition:]
279 | 			https://gist.github.com/TSiege/cbb0507082bb18ff7e4b#ﬁle-the-technical-interview-cheat-sheet-md
280 | 			5/18
281 | 			31/10/2019
282 | 			This is my technical interview cheat sheet. Feel free to fork it or do whatever you want with it. PLEASE let me know if there are any errors or if a…
283 | 			An algorithm that is called repeatedly but for a finite number of times, each time being a single iteration.
284 | 			Often used to move incrementally through a data set.
285 | 
286 | 		[What you need to know:]
287 | 			Generally you will see iteration as loops, for, while, and until statements.
288 | 			Think of iteration as moving one at a time through a set.
289 | 			Often used to move through an array.
290 | 
291 | 		[Recursion Vs. Iteration]
292 | 			The differences between recursion and iteration can be confusing to distinguish since both can be used to implement
293 | 			the other. But know that,
294 | 			Recursion is, usually, more expressive and easier to implement.
295 | 			Iteration uses less memory.
296 | 			Functional languages tend to use recursion. (i.e. Haskell)
297 | 			Imperative languages tend to use iteration. (i.e. Ruby)
298 | 			Check out this Stack Overflow post for more info.
299 | 
300 | 		[Pseudo Code of Moving Through an Array (this is why iteration is used for this)]
301 | 			Recursion                         | Iteration 
302 | 			----------------------------------|---------------------------------- 
303 | 			recursive method (array, n)       | iterative method (array) 
304 | 			  if array[n] is not nil          |   for n from 0 to size of array 
305 | 			    print array[n]                |     print(array[n]) 
306 | 			    recursive method(array, n+1)  | 
307 | 			  else                            | 
308 | 			    exit loop                     |
309 | 
310 | 	[Greedy Algorithm]
311 | 
312 | 		[Definition:]
313 | 			An algorithm that, while executing, selects only the information that meets a certain criteria.
314 | 			The general five components, taken from Wikipedia:
315 | 			A candidate set, from which a solution is created.
316 | 			A selection function, which chooses the best candidate to be added to the solution.
317 | 			A feasibility function, that is used to determine if a candidate can be used to contribute to a solution.
318 | 			An objective function, which assigns a value to a solution, or a partial solution.
319 | 			A solution function, which will indicate when we have discovered a complete solution.
320 | 
321 | 		[What you need to know:]
322 | 			Used to find the expedient, though non-optimal, solution for a given problem.
323 | 			Generally used on sets of data where only a small proportion of the information evaluated meets the desired result.
324 | 			Often a greedy algorithm can help reduce the Big O of an algorithm.
325 | 
326 | 		[Pseudo Code of a Greedy Algorithm to Find Largest Difference of any Two Numbers in an Array.]
327 | 			greedy algorithm (array) 
328 | 			  var largest difference = 0 
329 | 			  var new difference = find next difference (array[n], array[n+1]) 
330 | 			  largest difference = new difference if new difference is > largest difference 
331 | 			  repeat above two steps until all differences have been found 
332 | 			  return largest difference
333 | 			This algorithm never needed to compare all the differences to one another, saving it an entire iteration.
334 | 			Author
335 | 			Owner
336 | 			TSiege commented on 2 May 2014
337 | 			https://gist.github.com/TSiege/cbb0507082bb18ff7e4b#ﬁle-the-technical-interview-cheat-sheet-md
338 | 			6/18


--------------------------------------------------------------------------------
/tests/resources/samplepptx.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChrizH/pdfstructure/1be97d954418b17d0d628ae6595c6aab54c7bd1c/tests/resources/samplepptx.pdf


--------------------------------------------------------------------------------
/tests/test_custom_use_cases.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | from unittest import TestCase
 3 | 
 4 | from pdfstructure.hierarchy.parser import HierarchyParser
 5 | from pdfstructure.hierarchy.traversal import traverse_inorder_sections_with_content
 6 | from pdfstructure.printer import PrettyStringPrinter as txtPrinter
 7 | from pdfstructure.source import FileSource
 8 | 
 9 | 
10 | class TestExamples(TestCase):
11 |     parser = HierarchyParser()
12 | 
13 |     def test_count_paragraph_words(self):
14 |         test_file = str(Path("resources/lorem.pdf"))
15 |         document = self.parser.parse_pdf(FileSource(file_path=test_file))
16 |         assert_token_order = [50, 100, 150]
17 |         for level, title, content in traverse_inorder_sections_with_content(document):
18 |             prefix = txtPrinter.get_title_prefix(level)
19 |             tokens = content.split()
20 |             self.assertEqual(assert_token_order.pop(0), len(tokens))
21 |             print("{}{};\twords: {}".format(prefix, title, len(tokens)))
22 | 
23 |     def test_load_book(self):
24 |         book_path = Path("resources/interview_cheatsheet.pdf")
25 |         document = self.parser.parse_pdf(FileSource(file_path=str(book_path)))
26 | 
27 |         for level, title, content in traverse_inorder_sections_with_content(document):
28 |             prefix = txtPrinter.get_title_prefix(level)
29 |             print("{}{};\twords: {}".format(prefix, title, len(content.split())))
30 | 


--------------------------------------------------------------------------------
/tests/test_document.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from pathlib import Path
 3 | from unittest import TestCase
 4 | 
 5 | from pdfstructure.model.document import StructuredPdfDocument
 6 | 
 7 | 
 8 | class TestSection(TestCase):
 9 | 
10 |     def test_full_content(self):
11 |         with open(str(Path("resources/parsed/interview_cheatsheet.json").absolute()), "r") as fp:
12 |             json_string = json.load(fp)
13 |             document = StructuredPdfDocument.from_json(json_string)
14 |             text = document.text
15 | 
16 |             expected_newline_merged_subsections_excerpt = "Greedy Algorithm\nDefinition:\nAn algorithm that, while"
17 | 
18 |             self.assertTrue(expected_newline_merged_subsections_excerpt in text)
19 | 


--------------------------------------------------------------------------------
/tests/test_headercompare.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | 
 3 | from pdfminer.layout import LTChar, LTTextBoxHorizontal, LTTextLineHorizontal
 4 | 
 5 | from pdfstructure.hierarchy.headercompare import condition_h2_extends_h1, condition_h1_enum_h2_not
 6 | from pdfstructure.model.document import TextElement, Section
 7 | from pdfstructure.model.style import Style, TextSize
 8 | 
 9 | 
10 | class TestSubHeaderConditions(TestCase):
11 |     style_middle_bold = Style(bold=True, italic=True, font_name="test-font",
12 |                               mapped_font_size=TextSize.middle,
13 |                               mean_size=10, max_size=15)
14 | 
15 |     class TestFont:
16 |         fontname = "Test"
17 | 
18 |         def is_vertical(self):
19 |             return True
20 | 
21 |     def create_char(self, text):
22 |         return LTChar((1, 2, 3, 4, 5, 6), TestSubHeaderConditions.TestFont(), 10, 10, 10, text, 10, (1, 1), 10, "")
23 | 
24 |     def create_container(self, text):
25 |         box = LTTextBoxHorizontal()
26 |         line = LTTextLineHorizontal(0)
27 |         for c in text:
28 |             line.add(self.create_char(c))
29 |         box.add(line)
30 |         return box
31 | 
32 |     def test_condition_h2_extends_h1(self):
33 |         element1 = TextElement(text_container=self.create_container("1.1 This is a test header"),
34 |                                style=self.style_middle_bold)
35 | 
36 |         element2 = TextElement(text_container=self.create_container("1.1.2 This is a subheader of 1.1"),
37 |                                style=self.style_middle_bold)
38 | 
39 |         self.assertTrue(condition_h2_extends_h1(Section(element1), Section(element2)))
40 | 
41 |     def test_condition_enumeration__elements_with_same_style(self):
42 |         h1 = TextElement(text_container=self.create_container("1.1 This is a test header"),
43 |                          style=self.style_middle_bold)
44 | 
45 |         subheader = TextElement(text_container=self.create_container("This is a subheader of 1.1"),
46 |                                 style=self.style_middle_bold)
47 | 
48 |         neighbor_element = TextElement(text_container=self.create_container("1.2 this is NOT a subheader of 1.1"),
49 |                                        style=self.style_middle_bold)
50 | 
51 |         self.assertTrue(condition_h1_enum_h2_not(Section(h1), Section(subheader)))
52 |         self.assertFalse(condition_h1_enum_h2_not(Section(h1), Section(neighbor_element)))
53 | 


--------------------------------------------------------------------------------
/tests/test_hierarchy.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | from unittest import TestCase
  3 | 
  4 | from pdfminer.high_level import extract_text
  5 | 
  6 | from pdfstructure.hierarchy.parser import HierarchyParser
  7 | from pdfstructure.model.document import DanglingTextSection
  8 | from pdfstructure.printer import PrettyStringPrinter
  9 | from pdfstructure.source import FileSource
 10 | 
 11 | 
 12 | class TestHierarchy(TestCase):
 13 |     doc_with_columns = str(Path("resources/IE00BM67HT60-ATB-FS-DE-2020-2-28.pdf").absolute())
 14 |     straight_forward_doc = str(Path("resources/interview_cheatsheet.pdf").absolute())
 15 |     nested_doc_bold_title = str(Path("resources/5648.pdf").absolute())
 16 | 
 17 |     same_style_doc = str(Path("resources/SameStyleOnly.pdf").absolute())
 18 |     same_size_bold_header = str(Path("resources/SameSize_BoldTitle.pdf").absolute())
 19 |     same_size_enum_header = str(Path("resources/SameSize_EnumeratedTitle.pdf").absolute())
 20 |     paper = str(Path("resources/paper.pdf").absolute())
 21 |     parser = HierarchyParser()
 22 | 
 23 |     def test_no_hierarchy_detected(self):
 24 |         pdf = self.parser.parse_pdf(FileSource(self.same_style_doc))
 25 |         self.assertEqual(4, len(pdf.elements[0].children))
 26 | 
 27 |         self.assertIsInstance(pdf.elements[0], DanglingTextSection)
 28 | 
 29 |     def test_reading_order_paper_format(self):
 30 |         pdf = self.parser.parse_pdf(FileSource(self.paper))
 31 |         self.assertEqual(1, len(pdf.elements))
 32 |         self.assertEqual("5 Experiments: Passage Retrieval",pdf.elements[0].children[-2].heading_text)
 33 |         self.assertEqual("6 Experiments: Question Answering",pdf.elements[0].children[-1].heading_text)
 34 | 
 35 |     def test_hierarchy_bold_title(self):
 36 |         pdf = self.parser.parse_pdf(FileSource(self.same_size_bold_header))
 37 |         self.assertEqual(2, len(pdf.elements))
 38 |         self.assertEqual("Lorem Ipsum.", pdf.elements[0].heading.text)
 39 |         self.assertEqual("Appendix", pdf.elements[1].heading.text)
 40 | 
 41 |     def test_hierarchy_pdf_parser(self):
 42 |         path = self.straight_forward_doc
 43 |         source = FileSource(path)
 44 |         pdf = self.parser.parse_pdf(source)
 45 |         self.assertEqual(9, len(pdf.elements))
 46 |         self.assertEqual("Data Structure Basics", pdf.elements[5].heading.text)
 47 |         self.assertEqual("Basic Types of Algorithms", pdf.elements[8].heading.text)
 48 |         self.assertEqual(4, pdf.elements[8].heading.page)
 49 | 
 50 |     def test_grouping(self):
 51 |         test_doc = self.nested_doc_bold_title
 52 |         doc = self.parser.parse_pdf(FileSource(test_doc))
 53 |         self.assertEqual(1, doc.elements.__len__())
 54 |         self.assertEqual(13, doc.elements[0].children.__len__())
 55 |         self.assertEqual("Outdoorpädagogik", doc.elements[0].heading.text)
 56 |         self.assertEqual("„Fange den Stock“", doc.elements[0].children[0].heading.text)
 57 | 
 58 |     def test_grouping_bold_key_and_size(self):
 59 |         doc = self.parser.parse_pdf(FileSource(self.straight_forward_doc))
 60 |         self.assertEqual(len(doc.elements), 9)
 61 | 
 62 |     def skip_test_grouping_bold_columns(self):
 63 |         doc = self.parser.parse_pdf(FileSource(self.doc_with_columns))
 64 |         self.assertEqual("Xtrackers MSCI World Information Technology UCITS ETF 1C", doc.elements[1].heading.text)
 65 | 
 66 | 
 67 | class TestComapreFrameworks(TestCase):
 68 |     def skip_test_pdftotext(self):
 69 |         path = TestHierarchy.straight_forward_doc
 70 | 
 71 |         with open(path, "rb") as f:
 72 |             pdf = pdftotext.PDF(f)
 73 | 
 74 |         print("\n\n".join(pdf))
 75 | 
 76 |     def skip_test_pdfminer_high_level(self):
 77 |         text = extract_text(TestHierarchy.straight_forward_doc, laparams=None)
 78 |         print(text)
 79 | 
 80 |     def skip_test_pdfminer(self):
 81 |         from io import StringIO
 82 | 
 83 |         from pdfminer.converter import TextConverter
 84 |         from pdfminer.layout import LAParams
 85 |         from pdfminer.pdfdocument import PDFDocument
 86 |         from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 87 |         from pdfminer.pdfpage import PDFPage
 88 |         from pdfminer.pdfparser import PDFParser
 89 | 
 90 |         output_string = StringIO()
 91 |         with open(TestHierarchy.straight_forward_doc, 'rb') as in_file:
 92 |             parser = PDFParser(in_file)
 93 |             doc = PDFDocument(parser)
 94 |             rsrcmgr = PDFResourceManager()
 95 |             device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
 96 |             interpreter = PDFPageInterpreter(rsrcmgr, device)
 97 |             for page in PDFPage.create_pages(doc):
 98 |                 interpreter.process_page(page)
 99 | 
100 |         print(output_string.getvalue())
101 | 
102 |     def skip_test_pdfstructure(self):
103 |         path = TestHierarchy.straight_forward_doc
104 |         parser = HierarchyParser()
105 |         source = FileSource(path)
106 |         pdf = parser.parse_pdf(source)
107 |         print(PrettyStringPrinter().print(pdf))
108 | 


--------------------------------------------------------------------------------
/tests/test_printer.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from pathlib import Path
 3 | from unittest import TestCase
 4 | 
 5 | from pdfstructure.hierarchy.parser import HierarchyParser
 6 | from pdfstructure.model.document import StructuredPdfDocument
 7 | from pdfstructure.printer import PrettyStringFilePrinter, PrettyStringPrinter, JsonFilePrinter, JsonStringPrinter
 8 | from pdfstructure.source import FileSource
 9 | 
10 | 
11 | class TestPrettyStringPrinter(TestCase):
12 |     straight_forward_doc = str(Path("resources/interview_cheatsheet.pdf").absolute())
13 |     column_doc = str(Path("resources/IE00BM67HT60-ATB-FS-DE-2020-2-28.pdf").absolute())
14 |     correctFormattedText = "[Data Structure Basics]\n\n\t[Array]\n\n\t\t[Definition:]\n\t\t\tStores data elements" \
15 |                            " based on an sequential, most commonly 0 based, index."
16 | 
17 |     testDocument = None
18 | 
19 |     @classmethod
20 |     def setUpClass(cls) -> None:
21 |         parser = HierarchyParser()
22 |         cls.testDocument = parser.parse_pdf(FileSource(cls.straight_forward_doc))
23 | 
24 |     def test_print_pretty_string(self):
25 |         printer = PrettyStringPrinter()
26 |         printed = printer.print(self.testDocument)
27 |         self.assertTrue(self.correctFormattedText in printed)
28 | 
29 |     def test_print_pretty_file(self):
30 |         printer = PrettyStringFilePrinter()
31 | 
32 |         file_path = Path("resources/parsed/interview_cheatsheet_pretty.txt")
33 |         printed_file = printer.print(self.testDocument, file_path=str(file_path.absolute()))
34 | 
35 |         with open(printed_file, "r") as file:
36 |             printed = "".join(file.readlines())
37 |             self.assertTrue(self.correctFormattedText in printed)
38 | 
39 |     def test_print_json_string(self):
40 |         printer = JsonStringPrinter()
41 | 
42 |         jsonString = printer.print(self.testDocument)
43 | 
44 |         decoded_document = StructuredPdfDocument.from_json(json.loads(jsonString))
45 | 
46 |         self.assertEqual(self.testDocument.elements[1].heading.text,
47 |                          decoded_document.elements[1].heading.text)
48 |         self.assertEqual(self.testDocument.elements[-1].heading.text,
49 |                          decoded_document.elements[-1].heading.text)
50 | 
51 |         self.assertEqual("Array", decoded_document.elements[5].children[0].heading.text)
52 |         self.assertEqual("Time Complexity:", decoded_document.elements[5].children[0].children[2].heading.text)
53 | 
54 |     def test_print_json_file(self):
55 |         printer = JsonFilePrinter()
56 | 
57 |         file_path = Path("resources/parsed/interview_cheatsheet.json")
58 |         printer.print(self.testDocument, file_path=str(file_path.absolute()))
59 | 
60 |         with open(file_path, "r") as file:
61 |             decoded_document = StructuredPdfDocument.from_json(json.load(file))
62 | 
63 |             self.assertEqual(self.testDocument.elements[1].heading.text,
64 |                              decoded_document.elements[1].heading.text)
65 |             self.assertEqual(self.testDocument.elements[-1].heading.text,
66 |                              decoded_document.elements[-1].heading.text)
67 | 
68 |             self.assertEqual("Array", decoded_document.elements[5].children[0].heading.text)
69 |             self.assertEqual("Time Complexity:", decoded_document.elements[5].children[0].children[2].heading.text)
70 | 


--------------------------------------------------------------------------------
/tests/test_style_analyser.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | from collections import Counter
 3 | from pathlib import Path
 4 | from unittest import TestCase
 5 | 
 6 | import pandas as pd
 7 | 
 8 | from pdfstructure.analysis.annotate import StyleAnnotator
 9 | from pdfstructure.analysis.sizemapper import PivotLogMapper, PivotLinearMapper
10 | from pdfstructure.analysis.styledistribution import count_sizes, StyleDistribution
11 | from pdfstructure.model.style import TextSize
12 | from pdfstructure.utils import element_generator, find_file, DocTypeFilter
13 | 
14 | 
15 | class TestSizeMapper(TestCase):
16 | 
17 |     def test_log_mapper(self):
18 |         distribution = StyleDistribution(Counter((1, 5, 6, 10, 10, 10, 10, 20, 100)))
19 |         scaler = PivotLogMapper(distribution)
20 |         # test borders
21 |         borders = (4.02, 8.01, 14.41, 23.28)
22 |         [self.assertAlmostEqual(borders[i], b, 1) for i, b in enumerate(scaler.borders)]
23 |         self.assertEqual(TextSize.xsmall, scaler.translate(TextSize, -10))
24 |         self.assertEqual(TextSize.xsmall, scaler.translate(TextSize, 0))
25 |         self.assertEqual(TextSize.xsmall, scaler.translate(TextSize, 1))
26 |         self.assertEqual(TextSize.small, scaler.translate(TextSize, 5))
27 |         self.assertEqual(TextSize.small, scaler.translate(TextSize, 6))
28 |         self.assertEqual(TextSize.small, scaler.translate(TextSize, 7))
29 |         self.assertEqual(TextSize.small, scaler.translate(TextSize, 8))
30 |         self.assertEqual(TextSize.middle, scaler.translate(TextSize, 10))
31 |         self.assertEqual(TextSize.middle, scaler.translate(TextSize, 12))
32 |         self.assertEqual(TextSize.large, scaler.translate(TextSize, 15))
33 |         self.assertEqual(TextSize.large, scaler.translate(TextSize, 16))
34 |         self.assertEqual(TextSize.large, scaler.translate(TextSize, 20))
35 |         self.assertEqual(TextSize.xlarge, scaler.translate(TextSize, 30))
36 |         self.assertEqual(TextSize.xlarge, scaler.translate(TextSize, 200))
37 | 
38 |     
39 |     def test_linear_mapper(self):
40 |         distribution = StyleDistribution(Counter((1, 5, 6, 10, 10, 10, 10, 20, 100)))
41 |         scaler = PivotLinearMapper(distribution)
42 |         
43 |         self.assertTupleEqual((3.25, 5.5, 32.5, 55.0), scaler.borders)
44 |         
45 |         self.assertEqual(TextSize.xsmall, scaler.translate(TextSize, -10))
46 |         self.assertEqual(TextSize.xsmall, scaler.translate(TextSize, 0))
47 |         self.assertEqual(TextSize.xsmall, scaler.translate(TextSize, 1))
48 |         self.assertEqual(TextSize.small, scaler.translate(TextSize, 5))
49 |         self.assertEqual(TextSize.middle, scaler.translate(TextSize, 10))
50 |         self.assertEqual(TextSize.middle, scaler.translate(TextSize, 15))
51 |         self.assertEqual(TextSize.middle, scaler.translate(TextSize, 20))
52 |         self.assertEqual(TextSize.middle, scaler.translate(TextSize, 30))
53 |         self.assertEqual(TextSize.large, scaler.translate(TextSize, 40))
54 |         self.assertEqual(TextSize.large, scaler.translate(TextSize, 50))
55 |         self.assertEqual(TextSize.xlarge, scaler.translate(TextSize, 60))
56 |         self.assertEqual(TextSize.xlarge, scaler.translate(TextSize, 70))
57 |         self.assertEqual(TextSize.xlarge, scaler.translate(TextSize, 90))
58 |         self.assertEqual(TextSize.xlarge, scaler.translate(TextSize, 120))
59 | 
60 | 
61 | class TestFonts(TestCase):
62 |     def test_fontnames(self):
63 |         fonts = []
64 |         for file in itertools.islice(find_file(str(Path("resources/").absolute()), DocTypeFilter(endings=("pdf"))), 3):
65 |             if not file.is_file():
66 |                 continue
67 |             file_path = str(file.absolute())
68 |             distribution = count_sizes(element_generator(file_path))
69 |             
70 |             sizeMapper = PivotLogMapper(distribution)
71 |             style_annotator = StyleAnnotator(sizemapper=sizeMapper, style_info=distribution)
72 |             
73 |             elements = element_generator(file_path)
74 |             with_style = style_annotator.process(elements)
75 |             
76 |             for data in with_style:
77 |                 fonts.append(data.style.font_name)
78 |         
79 |         ds = pd.Series(fonts, name="fonts", dtype=str)
80 |         boldmasked = ds.loc[ds.apply(lambda x: "bold" in x.lower())]
81 |         italic = ds.loc[ds.apply(lambda x: "italic" in x.lower())]
82 |         
83 |         # todo, define test scenario for sample files


--------------------------------------------------------------------------------
/tests/test_traversal.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | from unittest import TestCase
 3 | 
 4 | from pdfstructure.hierarchy.parser import HierarchyParser
 5 | from pdfstructure.hierarchy.traversal import traverse_in_order, traverse_level_order, get_document_depth, \
 6 |     traverse_inorder_sections_with_content
 7 | from pdfstructure.model.document import DanglingTextSection
 8 | from pdfstructure.source import FileSource
 9 | 
10 | 
11 | class TestDocumentTraversal(TestCase):
12 |     straight_forward_doc = str(Path("resources/interview_cheatsheet.pdf").absolute())
13 |     same_style_doc = str(Path("resources/SameStyleOnly.pdf").absolute())
14 |     test_doc = None
15 |     test_doc_same_style = None
16 | 
17 |     @classmethod
18 |     def setUpClass(cls) -> None:
19 |         parser = HierarchyParser()
20 |         cls.test_doc_same_style = parser.parse_pdf(FileSource(cls.same_style_doc))
21 | 
22 |         cls.test_doc = parser.parse_pdf(FileSource(cls.straight_forward_doc))
23 | 
24 |     def test_get_height(self):
25 |         h = get_document_depth(self.test_doc)
26 |         self.assertEqual(4, h)
27 | 
28 |     def test_traverse_in_order(self):
29 |         elements = [element for element in traverse_in_order(self.test_doc)]
30 |         self.assertEqual(251, len(elements))
31 |         self.assertEqual("Data Structure Basics", elements[19].heading.text)
32 |         self.assertEqual("Array", elements[20].heading.text)
33 |         self.assertEqual("Definition:", elements[21].heading.text)
34 | 
35 |     def test_traverse_level_order(self):
36 |         elements = [element for element in traverse_level_order(self.test_doc)]
37 | 
38 |         self.assertEqual(251, len(elements))
39 | 
40 |         self.assertEqual("Data Structure Basics", elements[5].heading.text)
41 |         self.assertEqual("Search Basics", elements[6].heading.text)
42 |         self.assertEqual("Efficient Sorting Basics", elements[7].heading.text)
43 |         self.assertEqual("Basic Types of Algorithms", elements[8].heading.text)
44 | 
45 |         self.assertEqual("https://gist.github.com/TSiege/"
46 |                          "cbb0507082bb18ff7e4b#ﬁle-the-technical-interview-cheat-sheet-md", elements[-2].heading.text)
47 |         self.assertEqual("6/18", elements[-1].heading.text)
48 | 
49 |     def test_traverse_level_order_max_depth(self):
50 |         mid_level = get_document_depth(self.test_doc) / 2
51 | 
52 |         level_iterator = traverse_level_order(self.test_doc, max_depth=mid_level)
53 |         elements = [element for element in level_iterator]
54 | 
55 |         self.assertEqual(35, len(elements))
56 | 
57 |         self.assertEqual("Data Structure Basics", elements[5].heading.text)
58 |         self.assertEqual("Search Basics", elements[6].heading.text)
59 |         self.assertEqual("Efficient Sorting Basics", elements[7].heading.text)
60 |         self.assertEqual("Basic Types of Algorithms", elements[8].heading.text)
61 | 
62 |         self.assertEqual("Recursive Algorithms", elements[-3].heading.text)
63 |         self.assertEqual("Iterative Algorithms", elements[-2].heading.text)
64 |         self.assertEqual("Greedy Algorithm", elements[-1].heading.text)
65 | 
66 |     def test_test_level_order_but_flat_structure(self):
67 |         level_iterator = traverse_level_order(self.test_doc_same_style)
68 |         elements = [element for element in level_iterator]
69 |         self.assertEqual(5, len(elements))
70 |         self.assertIsInstance(elements[0], DanglingTextSection)
71 |         self.assertEqual("Lorem Ipsum.", elements[1].heading_text)
72 | 
73 |     def test_test_in_order_but_flat_structure(self):
74 |         level_iterator = traverse_in_order(self.test_doc_same_style)
75 |         elements = [element for element in level_iterator]
76 |         self.assertEqual(5, len(elements))
77 |         self.assertIsInstance(elements[0], DanglingTextSection)
78 |         self.assertEqual("Lorem Ipsum.", elements[1].heading_text)
79 | 
80 |     def test_retrieve_sections_with_content(self):
81 |         elements = [element for element in traverse_inorder_sections_with_content(self.test_doc)]
82 |         print(elements)
83 | 


--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | from unittest import TestCase
 3 | 
 4 | from pdfstructure.utils import element_generator, word_generator
 5 | 
 6 | 
 7 | class Test(TestCase):
 8 |     test_path_1 = str(Path("resources/interview_cheatsheet.pdf").absolute())
 9 |     
10 |     def test_word_generator(self):
11 |         elements = element_generator(self.test_path_1)
12 |         
13 |         # 1st text container
14 |         element = next(elements)
15 |         words = list(word_generator(element))
16 |         self.assertEqual(1, len(words))
17 |         self.assertEqual('31/10/2019', " ".join(words))
18 |         
19 |         # 2nd text container
20 |         element = next(elements)
21 |         words = list(word_generator(element))
22 |         self.assertEqual(31, len(words))
23 |         self.assertEqual("This is my technical interview cheat sheet.", " ".join(words[:7]))
24 | 


--------------------------------------------------------------------------------